Support WP2_ARGB_64 in WP2::ArgbBuffer

Read and write 16-bit PNGs.
Implement conversions from/to 16-bit samples.
No encoding/decoding support in WebP2 files for now.
Change alpha-premultiplication and bit depth scaling formulas to use
more precise divisions instead of bit shifts.
Add -8bits option to avifenc for convenience, to allow it to read
16-bit PNGs and encode them as 8-bit.
Fix PSNR formula for >8-bit samples.
Fix bypass not respecting the premultiplication field of the header.
Add tests.

Change-Id: Icea020405ef0b6cc586badd9e67e8f1a1e52ddcb
Reviewed-on: https://chromium-review.googlesource.com/c/codecs/libwebp2/+/6074517
Tested-by: WebM Builds <builds@webmproject.org>
Reviewed-by: Vincent Rabaud <vrabaud@google.com>
diff --git a/examples/cwp2.cc b/examples/cwp2.cc
index 147f3ba..77d7d18 100644
--- a/examples/cwp2.cc
+++ b/examples/cwp2.cc
@@ -77,6 +77,7 @@
   WP2::PictureHint pic_hint = WP2::HINT_NONE;
   bool crop = false;
   WP2::Rectangle crop_area;
+  bool downscale_to_8bits = false;
 };
 
 //------------------------------------------------------------------------------
@@ -103,6 +104,8 @@
                   WP2::EncoderConfig::kDefault.alpha_quality));
   opt.Add("-keep_unmultiplied",
           "No premultiplication is done (default when -q 100 -alpha_q 100).");
+  opt.Add("-8bits",
+          "Downscale high bit depth images to 8 bits before encoding.");
   opt.Add("-target_size <int>", "target size (in bytes)");
   opt.Add("-target_psnr <float>", "target PSNR (in dB. typically: 42)");
   opt.Add("-effort <int>",
@@ -328,9 +331,15 @@
     WP2::Data data;
     CHECK_STATUS(IoUtilReadFile(file_path, &data),
                  "Error! Cannot open input file '%s'", file_path);
-    CHECK_STATUS(WP2::ReadImage(data.bytes, data.size, rgb_buffer,
-                                WP2::FileFormat::AUTO, params.log_level),
-                 "Error! Cannot read input file '%s'", file_path);
+    WP2::ArgbBuffer rgb_buffer_16(WP2_ARGB_64);
+    CHECK_STATUS(
+        WP2::ReadImage(data.bytes, data.size,
+                       params.downscale_to_8bits ? &rgb_buffer_16 : rgb_buffer,
+                       WP2::FileFormat::AUTO, params.log_level),
+        "Error! Cannot read input file '%s'", file_path);
+    if (params.downscale_to_8bits) {
+      WP2_CHECK_STATUS(rgb_buffer->ConvertFrom(rgb_buffer_16));
+    }
     if (i == 0) {
       swap(metadata, rgb_buffer->metadata_);  // save for later call
     } else if (!rgb_buffer->metadata_.IsEmpty()) {
@@ -375,8 +384,10 @@
                         WP2::Writer* const writer) {
   // Read input image.
   double start = GetStopwatchTime();
-  WP2::ImageReader image_reader(file_path, buffer, WP2::FileFormat::AUTO,
-                                params.log_level);
+  WP2::ArgbBuffer buffer_16(WP2_ARGB_64);
+  WP2::ImageReader image_reader(file_path,
+                                params.downscale_to_8bits ? &buffer_16 : buffer,
+                                WP2::FileFormat::AUTO, params.log_level);
   // 'buffer' must not be modified between calls to ReadFrame(), so we use a
   // proxy.
   WP2::ArgbBuffer frame(buffer->format());
@@ -390,7 +401,11 @@
                "Error! There are too many frames in '%s'", file_path);
     CHECK_STATUS(image_reader.ReadFrame(&is_last_frame, &duration_ms),
                  "Error! Cannot read input '%s'", file_path);
-    CHECK_STATUS(frame.SetView(*buffer), "Error! Cannot set view.");
+    if (params.downscale_to_8bits) {
+      WP2_CHECK_STATUS(frame.ConvertFrom(buffer_16));
+    } else {
+      CHECK_STATUS(frame.SetView(*buffer), "Error! Cannot set view.");
+    }
     if (i == 0) {
       swap(metadata, buffer->metadata_);  // save for later call
     } else if (!buffer->metadata_.IsEmpty()) {
@@ -601,6 +616,8 @@
       params.config.alpha_quality = ExUtilGetFloat(argv[++c], &parse_error);
     } else if (!strcmp(argv[c], "-keep_unmultiplied")) {
       params.config.keep_unmultiplied = true;
+    } else if (!strcmp(argv[c], "-8bits")) {
+      params.downscale_to_8bits = true;
     } else if (!strcmp(argv[c], "-csp")) {
       NEED_ARGS(1);
       params.config.csp_type = (WP2::Csp)ExUtilGetInt(argv[++c], &parse_error);
diff --git a/imageio/pngdec.cc b/imageio/pngdec.cc
index 135e5f2..7a63a58 100644
--- a/imageio/pngdec.cc
+++ b/imageio/pngdec.cc
@@ -376,8 +376,9 @@
                  WP2_STATUS_BITSTREAM_ERROR);
     WP2_CHECK_STATUS(CheckDimensions((uint32_t)width, (uint32_t)height));
 
-    png_set_strip_16(png);
+    // Use 1 byte per pixel in 1, 2, or 4-bit depth files.
     png_set_packing(png);
+    // Expand data to 24-bit RGB.
     if (color_type == PNG_COLOR_TYPE_PALETTE) {
       png_set_palette_to_rgb(png);
     } else if (color_type == PNG_COLOR_TYPE_GRAY ||
@@ -386,9 +387,30 @@
       png_set_gray_to_rgb(png);
     }
 
+    bool has_alpha = color_type & PNG_COLOR_MASK_ALPHA;
     if (png_get_valid(png, head_info, PNG_INFO_tRNS)) {
       png_set_tRNS_to_alpha(png);
+      has_alpha = true;
     }
+    if (has_alpha) {
+      png_set_swap_alpha(png);  // ARGB order is preferred in libwebp2.
+    }
+
+    if (bit_depth <= 8) {
+      bit_depth = 8;
+    } else {
+      WP2_CHECK_OK(bit_depth == 16, WP2_STATUS_UNSUPPORTED_FEATURE);
+      png_set_swap(png);  // From big-endian to little-endian.
+      if (!has_alpha) {
+        // There is no WP2_RGB_48. Make it WP2_ARGB_64.
+        png_set_add_alpha(png, 65535, PNG_FILLER_BEFORE);
+        has_alpha = true;
+      }
+    }
+    // Refuse to decode if the PNG bit depth is greater than the user buffer bit
+    // depth, to avoid silently losing information.
+    WP2_CHECK_OK(static_cast<int>(WP2Formatbpc(buffer_->format())) >= bit_depth,
+                 WP2_STATUS_INVALID_PARAMETER);
 
     // Apply gamma correction if needed.
     double image_gamma = 1 / 2.2;
@@ -421,12 +443,14 @@
     png_read_update_info(png, head_info);
 
     const uint32_t num_channels = png_get_channels(png, head_info);
-    if (num_channels != 3 && num_channels != 4) {
-      return WP2_STATUS_BITSTREAM_ERROR;
-    }
-    const uint32_t depth = num_channels * sizeof(*tmp_rgb_);
+    WP2_CHECK_OK((num_channels == 4) == has_alpha,
+                 WP2_STATUS_INVALID_COLORSPACE);
+    // Verify the output is an existing WP2SampleFormat.
+    WP2_CHECK_OK((bit_depth == 8 && num_channels == 3) || num_channels == 4,
+                 WP2_STATUS_UNSUPPORTED_FEATURE);
+    const uint32_t bytes_per_pixel = num_channels * (bit_depth / 8);
     size_t stride = 0;
-    WP2_CHECK_STATUS(MultFitsIn(width, depth, &stride));
+    WP2_CHECK_STATUS(MultFitsIn(width, bytes_per_pixel, &stride));
     // Make sure whole size fits in size_t.
     WP2_CHECK_STATUS(MultFitsIn<size_t>(stride, height));
 
@@ -442,7 +466,9 @@
         png_read_row(png, row, NULL);
         if (p == num_passes - 1) {
           WP2_CHECK_STATUS(buffer_->ImportRow(
-              (num_channels == 4) ? WP2_RGBA_32 : WP2_RGB_24, y, row));
+              bit_depth == 8 ? (num_channels == 4 ? WP2_ARGB_32 : WP2_RGB_24)
+                             : WP2_ARGB_64,
+              y, row));
         }
         if (num_passes > 1) row += stride;
       }
diff --git a/imageio/pngenc.cc b/imageio/pngenc.cc
index 08bab6b..fe0a7db 100644
--- a/imageio/pngenc.cc
+++ b/imageio/pngenc.cc
@@ -182,15 +182,17 @@
 WP2Status WritePNG(const ArgbBuffer& buffer, FILE* fout, const char*, bool,
                    bool, size_t*) {
   WP2_CHECK_OK(fout != nullptr, WP2_STATUS_NULL_PARAMETER);
-  WP2_CHECK_OK(buffer.format() == WP2_Argb_32 || buffer.format() == WP2_ARGB_32,
+  // TODO(yguyon): Allow more formats and convert only when necessary.
+  WP2_CHECK_OK(buffer.format() == WP2_Argb_32 ||
+                   buffer.format() == WP2_ARGB_32 ||
+                   buffer.format() == WP2_ARGB_64,
                WP2_STATUS_UNSUPPORTED_FEATURE);
 
   const uint32_t width = buffer.width();
   const uint32_t height = buffer.height();
-  const bool has_alpha = buffer.HasTransparency();
-  volatile png_bytep row = (png_bytep)buffer.GetRow(0);
-  Vector_u8 ARGB;
-  WP2_CHECK_ALLOC_OK(ARGB.resize(width * (has_alpha ? 4u : 3u)));
+  // Not just HasTransparency() because there is no WP2_RGB_48 for now.
+  const bool has_alpha =
+      buffer.format() == WP2_ARGB_64 || buffer.HasTransparency();
   volatile png_structp png;
   volatile png_infop info;
 
@@ -208,25 +210,40 @@
     return WP2_STATUS_BAD_WRITE;
   }
   png_init_io(png, fout);
-  png_set_IHDR(png, info, width, height, 8,
+  png_set_IHDR(png, info, width, height, WP2Formatbpc(buffer.format()),
                has_alpha ? PNG_COLOR_TYPE_RGBA : PNG_COLOR_TYPE_RGB,
                PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_DEFAULT,
                PNG_FILTER_TYPE_DEFAULT);
   png_write_info(png, info);
+  if (buffer.format() == WP2_ARGB_64) {
+    png_set_swap(png);        // From big-endian to little-endian.
+    png_set_swap_alpha(png);  // From RGBA to ARGB.
+  }
 
-  WP2ArgbConverterInit();
-  for (png_uint_32 y = 0; y < height; ++y) {
-    if (buffer.format() == WP2_Argb_32) {
-      WP2ArgbConvertTo[has_alpha ? WP2_RGBA_32 : WP2_RGB_24](row, width,
-                                                             ARGB.data());
-    } else {
-      assert(buffer.format() == WP2_ARGB_32);
-      WP2ARGBConvertTo[has_alpha ? WP2_RGBA_32 : WP2_RGB_24](row, width,
-                                                             ARGB.data());
+  if (buffer.format() == WP2_Argb_32 || buffer.format() == WP2_ARGB_32) {
+    Vector_u8 ARGB;
+    WP2_CHECK_ALLOC_OK(ARGB.resize(width * WP2FormatBpp(buffer.format())));
+    WP2ArgbConverterInit();
+    for (png_uint_32 y = 0; y < height; ++y) {
+      volatile png_bytep const row = (png_bytep)buffer.GetRow(y);
+      if (buffer.format() == WP2_Argb_32) {
+        WP2ArgbConvertTo[has_alpha ? WP2_RGBA_32 : WP2_RGB_24](row, width,
+                                                               ARGB.data());
+      } else {
+        assert(buffer.format() == WP2_ARGB_32);
+        WP2ARGBConvertTo[has_alpha ? WP2_RGBA_32 : WP2_RGB_24](row, width,
+                                                               ARGB.data());
+      }
+      png_bytep ptr[1] = {ARGB.data()};
+      png_write_rows(png, ptr, 1);
     }
-    png_bytep ptr[1] = {ARGB.data()};
-    png_write_rows(png, ptr, 1);
-    row += buffer.stride();
+  } else {
+    assert(buffer.format() == WP2_ARGB_64);
+    for (png_uint_32 y = 0; y < height; ++y) {
+      volatile png_bytep const row = (png_bytep)buffer.GetRow(y);
+      png_bytep ptr[1] = {row};
+      png_write_rows(png, ptr, 1);
+    }
   }
   png_write_end(png, info);
   png_destroy_write_struct((png_structpp)&png, (png_infopp)&info);
diff --git a/src/common/color_precision.h b/src/common/color_precision.h
index f8df5fe..e915b8d 100644
--- a/src/common/color_precision.h
+++ b/src/common/color_precision.h
@@ -172,16 +172,33 @@
 
 // Returns the maximum value for a given channel.
 // 'channel' is the index of the channel in order ARGB.
-static inline uint64_t FormatMax(WP2SampleFormat format, uint32_t channel) {
-  if (format == WP2_Argb_32 || channel == 0) {
-    return WP2::kAlphaMax;
-  } else if (format == WP2_ARGB_32) {
-    return 255;
-  } else if (format == WP2_Argb_38) {
-    return (channel == 0) ? WP2::kAlphaMax : 1023;
+static inline uint32_t FormatMax(WP2SampleFormat format, uint32_t channel) {
+  if (channel == 0) {
+    // Alpha
+    switch (format) {
+      case WP2_Argb_32:
+      case WP2_ARGB_32:
+      case WP2_XRGB_32:
+      case WP2_rgbA_32:
+      case WP2_RGBA_32:
+      case WP2_RGBX_32:
+      case WP2_bgrA_32:
+      case WP2_BGRA_32:
+      case WP2_BGRX_32:
+      case WP2_RGB_24:
+      case WP2_BGR_24:
+      case WP2_Argb_38:  // Alpha in WP2_Argb_38 uses only 8 bits.
+        return WP2::kAlphaMax;
+      case WP2_ARGB_64:
+        return 65535;
+      default:
+        assert(false);
+        return 0;
+    }
+  } else {
+    // RGB
+    return (1u << WP2Formatbpc(format)) - 1u;
   }
-  assert(false);
-  return 0;
 }
 
 }  // namespace WP2
diff --git a/src/common/symbols.cc b/src/common/symbols.cc
index 01a09ad..6f5d6d5 100644
--- a/src/common/symbols.cc
+++ b/src/common/symbols.cc
@@ -631,14 +631,12 @@
                    int32_t maxima_range[4], uint32_t num_transforms) {
   Segment segments[4];
   // Define the original range according to the pixel format.
-  // Deal with alpha.
-  segments[0] = {0, ::WP2::kAlphaMax};
   const int32_t num_bits_alpha = ::WP2::kAlphaBits;
-  // Deal with other channels.
-  const int32_t num_bits_non_alpha = (format == WP2_Argb_38) ? 10 : 8;
-  for (uint32_t c = 1; c < 4; ++c) {
-    segments[c] = {0, (1 << num_bits_non_alpha) - 1};
+  const int32_t num_bits_non_alpha = static_cast<int32_t>(WP2Formatbpc(format));
+  for (uint32_t c = 0; c < 4; ++c) {
+    segments[c] = {0, static_cast<int32_t>(WP2::FormatMax(format, c))};
   }
+  assert(segments[0].max == WP2::kAlphaMax);  // WP2_ARGB_64 is unsupported.
   bool do_clamp = true;
   for (uint32_t i = 0; i < num_transforms; ++i) {
     if (headers[i].type == TransformType::kNum) break;
diff --git a/src/dec/anim_dec.cc b/src/dec/anim_dec.cc
index 8c149f9..f51ad71 100644
--- a/src/dec/anim_dec.cc
+++ b/src/dec/anim_dec.cc
@@ -122,6 +122,7 @@
       rgb_output->Fill(lft, color);
       rgb_output->Fill(rgt, color);
     } else {
+      assert(WP2Formatbpc(rgb_output->format()) == 10);
       rgb_output->Fill(top, features.background_color);
       rgb_output->Fill(bot, features.background_color);
       rgb_output->Fill(lft, features.background_color);
diff --git a/src/dec/bypass_dec.cc b/src/dec/bypass_dec.cc
index df3a970..02a4884 100644
--- a/src/dec/bypass_dec.cc
+++ b/src/dec/bypass_dec.cc
@@ -19,7 +19,6 @@
 
 #include <cassert>
 #include <cstdint>
-#include <cstring>
 
 #include "src/common/global_params.h"
 #include "src/common/header_enc_dec.h"
@@ -27,7 +26,6 @@
 #include "src/common/lossy/block_size.h"
 #include "src/dec/filters/block_map_filter.h"
 #include "src/dec/tile_dec.h"
-#include "src/dsp/math.h"
 #include "src/utils/data_source.h"
 #include "src/utils/utils.h"
 #include "src/wp2/base.h"
@@ -96,40 +94,18 @@
     DataSource::DataHandle handle;
     WP2_CHECK_OK(tile_->input->TryReadNext(bytes_per_row, &handle),
                  WP2_STATUS_NOT_ENOUGH_DATA);
-    if (tile_->rgb_output.format() == WP2_Argb_32 ||
-        tile_->rgb_output.format() == WP2_ARGB_32) {
-      uint8_t* const row = tile_->rgb_output.GetRow8(y);
-      if (gparams_->has_alpha_) {
-        std::memcpy(row, handle.GetBytes(), handle.GetSize());
-      } else {
-        for (uint32_t x = 0; x < tile_->rect.width; ++x) {
-          row[x * 4 + 0] = kAlphaMax;
-          std::memcpy(row + x * 4 + 1, handle.GetBytes() + x * 3, 3);
-        }
-      }
-    } else {
-      assert(tile_->rgb_output.format() == WP2_Argb_38);
-      uint16_t* const row = tile_->rgb_output.GetRow16(y);
-      if (gparams_->has_alpha_) {
-        for (uint32_t x = 0; x < tile_->rect.width; ++x) {
-          row[x * 4 + 0] = handle.GetBytes()[x * 4];
-          const uint8_t* const rgb = handle.GetBytes() + x * 4 + 1;
-          for (uint32_t i : {0, 1, 2}) {
-            row[x * 4 + 1 + i] =
-                ChangePrecision<uint16_t>(rgb[i], /*from=*/8, /*to=*/10);
-          }
-        }
-      } else {
-        for (uint32_t x = 0; x < tile_->rect.width; ++x) {
-          row[x * 4 + 0] = kAlphaMax;
-          const uint8_t* const rgb = handle.GetBytes() + x * 3;
-          for (uint32_t i : {0, 1, 2}) {
-            row[x * 4 + 1 + i] =
-                ChangePrecision<uint16_t>(rgb[i], /*from=*/8, /*to=*/10);
-          }
-        }
-      }
-    }
+    ArgbBuffer raw_row(
+        gparams_->has_alpha_
+            ? (features_->is_premultiplied ? WP2_Argb_32 : WP2_ARGB_32)
+            : WP2_RGB_24);
+    WP2_CHECK_STATUS(raw_row.SetExternal(
+        tile_->rect.width, 1, const_cast<uint8_t*>(handle.GetBytes()),
+        handle.GetSize()));
+
+    ArgbBuffer rgb_output_row(tile_->rgb_output.format());
+    WP2_CHECK_STATUS(rgb_output_row.SetView(tile_->rgb_output,
+                                            {0, y, tile_->rect.width, 1}));
+    WP2_CHECK_STATUS(rgb_output_row.ConvertFrom(raw_row));
     WP2_CHECK_STATUS(tile_->row_progress.AdvanceBy(1.));
   }
   return WP2_STATUS_OK;
@@ -139,6 +115,8 @@
   BitUnpacker bit_unpacker(tile_->input, "raw_pixels");
   const uint32_t num_bits_per_row =
       ((gparams_->has_alpha_ ? kAlphaBits : 0) + 3 * 10) * tile_->rect.width;
+  ArgbBuffer raw_row(WP2_Argb_38);
+  WP2_CHECK_STATUS(raw_row.Resize(tile_->rect.width, 1));
 
   for (tile_->num_decoded_rows = 0;
        tile_->num_decoded_rows < tile_->rect.height;
@@ -150,32 +128,19 @@
                  WP2_STATUS_NOT_ENOUGH_DATA);
 
     const uint32_t y = tile_->num_decoded_rows;
-    if (tile_->rgb_output.format() == WP2_Argb_32 ||
-        tile_->rgb_output.format() == WP2_ARGB_32) {
-      uint8_t* const row = tile_->rgb_output.GetRow8(y);
-      for (uint32_t x = 0; x < tile_->rect.width; ++x) {
-        row[x * 4 + 0] = gparams_->has_alpha_
-                             ? bit_unpacker.ReadBits(kAlphaBits, "alpha")
-                             : kAlphaMax;
-        row[x * 4 + 1] = ChangePrecision(bit_unpacker.ReadBits(10, "red"),
-                                         /*from=*/10, /*to=*/8);
-        row[x * 4 + 2] = ChangePrecision(bit_unpacker.ReadBits(10, "green"),
-                                         /*from=*/10, /*to=*/8);
-        row[x * 4 + 3] = ChangePrecision(bit_unpacker.ReadBits(10, "blue"),
-                                         /*from=*/10, /*to=*/8);
-      }
-    } else {
-      assert(tile_->rgb_output.format() == WP2_Argb_38);
-      uint16_t* const row = tile_->rgb_output.GetRow16(y);
-      for (uint32_t x = 0; x < tile_->rect.width; ++x) {
-        row[x * 4 + 0] = gparams_->has_alpha_
-                             ? bit_unpacker.ReadBits(kAlphaBits, "alpha")
-                             : kAlphaMax;
-        row[x * 4 + 1] = bit_unpacker.ReadBits(10, "red");
-        row[x * 4 + 2] = bit_unpacker.ReadBits(10, "green");
-        row[x * 4 + 3] = bit_unpacker.ReadBits(10, "blue");
-      }
+    uint16_t* raw_pixel = raw_row.GetRow16(0);
+    for (uint32_t x = 0; x < tile_->rect.width; ++x, raw_pixel += 4) {
+      raw_pixel[0] = gparams_->has_alpha_
+                         ? bit_unpacker.ReadBits(kAlphaBits, "alpha")
+                         : kAlphaMax;
+      raw_pixel[1] = bit_unpacker.ReadBits(10, "red");
+      raw_pixel[2] = bit_unpacker.ReadBits(10, "green");
+      raw_pixel[3] = bit_unpacker.ReadBits(10, "blue");
     }
+    ArgbBuffer rgb_output_row(tile_->rgb_output.format());
+    WP2_CHECK_STATUS(rgb_output_row.SetView(tile_->rgb_output,
+                                            {0, y, tile_->rect.width, 1}));
+    WP2_CHECK_STATUS(rgb_output_row.ConvertFrom(raw_row));
     WP2_CHECK_STATUS(tile_->row_progress.AdvanceBy(1.));
   }
 
diff --git a/src/dec/lossless/losslessi_dec.cc b/src/dec/lossless/losslessi_dec.cc
index 43c6ffe..e5acd9c 100644
--- a/src/dec/lossless/losslessi_dec.cc
+++ b/src/dec/lossless/losslessi_dec.cc
@@ -196,22 +196,30 @@
 template <typename T>
 uint32_t EmitRows(const int16_t* row_in, uint32_t mb_w, uint32_t mb_h,
                   bool has_alpha, bool image_is_premultiplied,
-                  bool tile_is_premultiplied, T* const out,
-                  uint32_t out_stride) {
+                  bool tile_is_premultiplied, T* const out, uint32_t out_stride,
+                  WP2SampleFormat out_format) {
   // If the whole image is expected to be unmultiplied, the lossless compression
   // algorithm cannot have used premultiplied samples.
   if (!image_is_premultiplied) assert(!tile_is_premultiplied);
+  assert(out_format == WP2_Argb_32 || out_format == WP2_ARGB_32 ||
+         out_format == WP2_Argb_38);
+  if (has_alpha) {
+    assert(WP2IsPremultiplied(out_format) == image_is_premultiplied);
+  }
+  const uint32_t max_value = WP2::FormatMax(out_format, 1);
 
   int lines = mb_h;
   T* row_out = out;
   while (lines-- > 0) {
     // TODO(vrabaud) support multiple output color spaces.
     for (uint32_t i = 0; i < 4 * mb_w; i += 4, row_in += 4) {
-      const uint8_t alpha = (uint8_t)row_in[0];
+      const uint8_t alpha = static_cast<uint8_t>(row_in[0]);
       if (!has_alpha) assert(alpha == WP2::kAlphaMax);
       row_out[i] = alpha;
-      if (!image_is_premultiplied || (alpha > 0 && tile_is_premultiplied)) {
-        // Keep unmultiplied or keep premultiplied.
+      if (!image_is_premultiplied || (alpha > 0 && tile_is_premultiplied) ||
+          alpha == WP2::kAlphaMax) {
+        // Keep unmultiplied, or keep premultiplied, or it does not matter
+        // because alpha is fully opaque.
         for (int j = 1; j <= 3; ++j) {
           row_out[i + j] = static_cast<T>(row_in[j]);
         }
@@ -219,7 +227,7 @@
         // Premultiply.
         for (int j = 1; j <= 3; ++j) {
           row_out[i + j] = static_cast<T>(
-              WP2::DivBy255(static_cast<uint32_t>(row_in[j]) * alpha));
+              std::min((row_in[j] * alpha + 127u) / 255u, max_value));
         }
       } else {
         // To optimize encoding, the encoder can store arbitrary rgb values when
@@ -332,12 +340,12 @@
       num_rows_out =
           EmitRows(rows_data, buf->width(), num_rows, gparams_->has_alpha_,
                    image_is_premultiplied_, tile_is_premultiplied_,
-                   buf->GetRow8(last_out_row_), buf->stride());
+                   buf->GetRow8(last_out_row_), buf->stride(), buf->format());
     } else {
       num_rows_out =
           EmitRows(rows_data, buf->width(), num_rows, gparams_->has_alpha_,
                    image_is_premultiplied_, tile_is_premultiplied_,
-                   buf->GetRow16(last_out_row_), buf->stride());
+                   buf->GetRow16(last_out_row_), buf->stride(), buf->format());
     }
     // Update 'last_out_row_'.
     last_out_row_ += num_rows_out;
diff --git a/src/dsp/argb_converter.cc b/src/dsp/argb_converter.cc
index 6312504..39c352f 100644
--- a/src/dsp/argb_converter.cc
+++ b/src/dsp/argb_converter.cc
@@ -23,7 +23,6 @@
 #include <cstring>
 
 #include "src/dsp/dsp.h"
-#include "src/dsp/math.h"
 #include "src/wp2/base.h"
 
 namespace {
@@ -39,43 +38,37 @@
 //------------------------------------------------------------------------------
 // C conversion from a format to another
 
-// Demultiplies color 'v' by alpha 'a'. Set 'kIs8b' to true if 'v' fits in 8
-// bits (and will fit afterwards) so that a faster version is used. 'a' is 8b.
-template <typename Type, bool kIs8b = (sizeof(Type) == 1)>
-void Unmult(uint32_t a, Type v[]) {
+// Demultiplies color 'v' by alpha 'a'.
+template <typename Type, int kAlphaMax, int kColorMax>
+void Unmult(uint32_t a, Type& r, Type& g, Type& b) {
   if (a == 0) {
-    v[0] = v[1] = v[2] = 0;
+    r = g = b = 0;
     return;
   }
-  if (a == 255) return;
+  if (a == kAlphaMax) return;
 
-  if (kIs8b) {
-    const uint32_t M = WP2::kAlphaDiv[a];
-    for (int c : {0, 1, 2}) {
-      v[c] = std::min(WP2::DivByAlphaDiv(v[c], M), 255u);
-    }
-  } else {
-    // WP2::DivByAlphaDiv() cannot be used here because the result needs to
-    // be clamped to 255.
-    const uint32_t round = a >> 1;
-    for (int c : {0, 1, 2}) {
-      v[c] = (v[c] * 255 + round) / a;
-    }
-  }
+  const uint32_t round = a >> 1;
+  r = static_cast<Type>(
+      std::min<uint32_t>((r * kAlphaMax + round) / a, kColorMax));
+  g = static_cast<Type>(
+      std::min<uint32_t>((g * kAlphaMax + round) / a, kColorMax));
+  b = static_cast<Type>(
+      std::min<uint32_t>((b * kAlphaMax + round) / a, kColorMax));
 }
 
-// Premultiplies color 'v' (8- or 10-bit) by alpha 'a' (8-bit).
-template <typename Type>
-void Premult(uint32_t a, Type v[]) {
+// Premultiplies color 'v' by alpha 'a'.
+template <typename Type, int kAlphaMax>
+void Premult(uint32_t a, Type& r, Type& g, Type& b) {
   if (a == 0) {
-    v[0] = v[1] = v[2] = 0;
+    r = g = b = 0;
     return;
   }
-  if (a == 255) return;
+  if (a == kAlphaMax) return;
 
-  for (int c : {0, 1, 2}) {
-    v[c] = WP2::DivBy255(v[c] * a);
-  }
+  constexpr uint32_t kAlphaRound = kAlphaMax >> 1;
+  r = static_cast<Type>((r * a + kAlphaRound) / kAlphaMax);
+  g = static_cast<Type>((g * a + kAlphaRound) / kAlphaMax);
+  b = static_cast<Type>((b * a + kAlphaRound) / kAlphaMax);
 }
 
 // Converts 'width' px from 'src_data' of 'SrcType' to 'dst_data' of 'DstType'.
@@ -83,104 +76,137 @@
 // Positive 'kMult' will premultiply the destination by the source alpha.
 // Negative 'kMult' will unmultiply the destination by the source alpha.
 // 'kFillAlpha' will replace the destination alpha by 255.
-template <typename SrcType, int kSrcA, int kSrcR, int kSrcG, int kSrcB,
-          int kSrcStep, typename DstType, int kDstA, int kDstR, int kDstG,
-          int kDstB, int kDstStep, int kMult, bool kFillAlpha>
-void Convert_C(const void* src_data, uint32_t width, void* dst_data) {
+template <typename SrcType, int kSrcDepth, int kSrcA, int kSrcR, int kSrcG,
+          int kSrcB, int kSrcStep, typename DstType, int kDstDepth, int kDstA,
+          int kDstR, int kDstG, int kDstB, int kDstStep, int kMult,
+          bool kFillAlpha>
+void Cvrt_C(const void* src_data, uint32_t width, void* dst_data) {
+  static_assert((kSrcDepth == 8 || kSrcDepth == 10 || kSrcDepth == 16) &&
+                    (kDstDepth == 8 || kDstDepth == 10 || kDstDepth == 16),
+                "Unimplemented");
+
+  constexpr uint32_t kSrcMax = (1u << kSrcDepth) - 1u;
+  constexpr uint32_t kSrcRound = kSrcMax >> 1;
+  constexpr uint32_t kDstMax = (1u << kDstDepth) - 1u;
+
+  // The alpha channel of WP2_Argb_38 (depth = 10) uses only 8 bits.
+  constexpr uint32_t kSrcAlphaMax = kSrcDepth <= 10 ? 255u : kSrcMax;
+  constexpr uint32_t kSrcAlphaRound = kSrcAlphaMax >> 1;
+  constexpr uint32_t kDstAlphaMax = kDstDepth <= 10 ? 255u : kDstMax;
+
   assert(src_data != dst_data);
-  const SrcType* src = (const SrcType*)src_data;
-  DstType* dst = (DstType*)dst_data;
+  const SrcType* src = reinterpret_cast<const SrcType*>(src_data);
+  DstType* dst = reinterpret_cast<DstType*>(dst_data);
   for (uint32_t i = 0; i < width; ++i, src += kSrcStep, dst += kDstStep) {
     if (kDstStep == 4) {
-      dst[kDstA] = (kFillAlpha ? 0xff : src[kSrcA]);
+      if (kFillAlpha) {
+        dst[kDstA] = kDstAlphaMax;
+      } else if (kSrcAlphaMax == kDstAlphaMax) {
+        dst[kDstA] = src[kSrcA];
+      } else {
+        dst[kDstA] =
+            (src[kSrcA] * kDstAlphaMax + kSrcAlphaRound) / kSrcAlphaMax;
+      }
     }
     uint32_t tmp[3] = {src[kSrcR], src[kSrcG], src[kSrcB]};
-    if (sizeof(SrcType) < sizeof(DstType)) {
+    if (kSrcDepth < kDstDepth) {
       for (uint32_t& channel : tmp) {
-        channel = WP2::ChangePrecision(channel, /*from=*/8, /*to=*/10);
+        channel = (channel * kDstMax + kSrcRound) / kSrcMax;
       }
     }
+    // Premul or unmul by alpha at the highest bit depth to keep accuracy.
+    constexpr uint32_t kTmpMax = kSrcDepth < kDstDepth ? kDstMax : kSrcMax;
     if (kMult > 0) {
-      Premult(src[kSrcA], tmp);
+      Premult<uint32_t, kSrcAlphaMax>(src[kSrcA], tmp[0], tmp[1], tmp[2]);
     } else if (kMult < 0) {
-      Unmult<uint32_t, /*kIs8b=*/(sizeof(SrcType) == 1 &&
-                                  sizeof(DstType) == 1)>(src[kSrcA], tmp);
+      Unmult<uint32_t, kSrcAlphaMax, kTmpMax>(src[kSrcA], tmp[0], tmp[1],
+                                              tmp[2]);
     }
-    if (sizeof(SrcType) > sizeof(DstType)) {
+    if (kSrcDepth > kDstDepth) {
       for (uint32_t& channel : tmp) {
-        channel = WP2::ChangePrecision(channel, /*from=*/10, /*to=*/8);
+        channel = (channel * kDstMax + kSrcRound) / kSrcMax;
       }
     }
-    dst[kDstR] = tmp[0], dst[kDstG] = tmp[1], dst[kDstB] = tmp[2];
+    dst[kDstR] = static_cast<DstType>(tmp[0]);
+    dst[kDstG] = static_cast<DstType>(tmp[1]);
+    dst[kDstB] = static_cast<DstType>(tmp[2]);
   }
 }
 
 // From the specified format to Argb32 or ARGB32.
-#define CONVERT_TO_ARGB32(SRC_TYPE, A, R, G, B, MULT, FILL, SRC_STEP) \
-  Convert_C<SRC_TYPE, A, R, G, B, SRC_STEP, uint8_t, 0, 1, 2, 3, 4, MULT, FILL>
+#define CVRT_TO_ARGB32(SRC_TYPE, SRC_DEPTH, A, R, G, B, MULT, FILL, SRC_STEP)  \
+  Cvrt_C<SRC_TYPE, SRC_DEPTH, A, R, G, B, SRC_STEP, uint8_t, 8, 0, 1, 2, 3, 4, \
+         MULT, FILL>
 
 // From the Argb32 or ARGB32 to specified format.
-#define CONVERT_ARGB32_TO(DST_TYPE, A, R, G, B, MULT, FILL, DST_STEP) \
-  Convert_C<uint8_t, 0, 1, 2, 3, 4, DST_TYPE, A, R, G, B, DST_STEP, MULT, FILL>
+#define CVRT_ARGB32_TO(DST_TYPE, DST_DEPTH, A, R, G, B, MULT, FILL, DST_STEP)  \
+  Cvrt_C<uint8_t, 8, 0, 1, 2, 3, 4, DST_TYPE, DST_DEPTH, A, R, G, B, DST_STEP, \
+         MULT, FILL>
 
-#define Convert_Argb32_Argb32_C CONVERT_TO_ARGB32(uint8_t, 0, 1, 2, 3, 0, 0, 4)
-#define Convert_ARGB32_ARGB32_C CONVERT_TO_ARGB32(uint8_t, 0, 1, 2, 3, 0, 0, 4)
+#define Cvrt_Argb32_Argb32_C CVRT_TO_ARGB32(uint8_t, 8, 0, 1, 2, 3, 0, 0, 4)
+#define Cvrt_ARGB32_ARGB32_C CVRT_TO_ARGB32(uint8_t, 8, 0, 1, 2, 3, 0, 0, 4)
 
-#define Convert_ARGB32_Argb32_C CONVERT_TO_ARGB32(uint8_t, 0, 1, 2, 3, 1, 0, 4)
-#define Convert_XRGB32_Argb32_C CONVERT_TO_ARGB32(uint8_t, 0, 1, 2, 3, 0, 1, 4)
-#define Convert_rgbA32_Argb32_C CONVERT_TO_ARGB32(uint8_t, 3, 0, 1, 2, 0, 0, 4)
-#define Convert_RGBA32_Argb32_C CONVERT_TO_ARGB32(uint8_t, 3, 0, 1, 2, 1, 0, 4)
-#define Convert_RGBX32_Argb32_C CONVERT_TO_ARGB32(uint8_t, 3, 0, 1, 2, 0, 1, 4)
-#define Convert_bgrA32_Argb32_C CONVERT_TO_ARGB32(uint8_t, 3, 2, 1, 0, 0, 0, 4)
-#define Convert_BGRA32_Argb32_C CONVERT_TO_ARGB32(uint8_t, 3, 2, 1, 0, 1, 0, 4)
-#define Convert_BGRX32_Argb32_C CONVERT_TO_ARGB32(uint8_t, 3, 2, 1, 0, 0, 1, 4)
-#define Convert_RGB32_Argb32_C CONVERT_TO_ARGB32(uint8_t, 0, 0, 1, 2, 0, 1, 3)
-#define Convert_BGR32_Argb32_C CONVERT_TO_ARGB32(uint8_t, 0, 2, 1, 0, 0, 1, 3)
+#define Cvrt_ARGB32_Argb32_C CVRT_TO_ARGB32(uint8_t, 8, 0, 1, 2, 3, 1, 0, 4)
+#define Cvrt_XRGB32_Argb32_C CVRT_TO_ARGB32(uint8_t, 8, 0, 1, 2, 3, 0, 1, 4)
+#define Cvrt_rgbA32_Argb32_C CVRT_TO_ARGB32(uint8_t, 8, 3, 0, 1, 2, 0, 0, 4)
+#define Cvrt_RGBA32_Argb32_C CVRT_TO_ARGB32(uint8_t, 8, 3, 0, 1, 2, 1, 0, 4)
+#define Cvrt_RGBX32_Argb32_C CVRT_TO_ARGB32(uint8_t, 8, 3, 0, 1, 2, 0, 1, 4)
+#define Cvrt_bgrA32_Argb32_C CVRT_TO_ARGB32(uint8_t, 8, 3, 2, 1, 0, 0, 0, 4)
+#define Cvrt_BGRA32_Argb32_C CVRT_TO_ARGB32(uint8_t, 8, 3, 2, 1, 0, 1, 0, 4)
+#define Cvrt_BGRX32_Argb32_C CVRT_TO_ARGB32(uint8_t, 8, 3, 2, 1, 0, 0, 1, 4)
+#define Cvrt_RGB32_Argb32_C CVRT_TO_ARGB32(uint8_t, 8, 0, 0, 1, 2, 0, 1, 3)
+#define Cvrt_BGR32_Argb32_C CVRT_TO_ARGB32(uint8_t, 8, 0, 2, 1, 0, 0, 1, 3)
 
-#define Convert_XRGB32_ARGB32_C CONVERT_TO_ARGB32(uint8_t, 0, 1, 2, 3, 0, 1, 4)
-#define Convert_rgbA32_ARGB32_C CONVERT_TO_ARGB32(uint8_t, 3, 0, 1, 2, -1, 0, 4)
-#define Convert_RGBA32_ARGB32_C CONVERT_TO_ARGB32(uint8_t, 3, 0, 1, 2, 0, 0, 4)
-#define Convert_RGBX32_ARGB32_C CONVERT_TO_ARGB32(uint8_t, 3, 0, 1, 2, 0, 1, 4)
-#define Convert_bgrA32_ARGB32_C CONVERT_TO_ARGB32(uint8_t, 3, 2, 1, 0, -1, 0, 4)
-#define Convert_BGRA32_ARGB32_C CONVERT_TO_ARGB32(uint8_t, 3, 2, 1, 0, 0, 0, 4)
-#define Convert_BGRX32_ARGB32_C CONVERT_TO_ARGB32(uint8_t, 3, 2, 1, 0, 0, 1, 4)
-#define Convert_RGB32_ARGB32_C CONVERT_TO_ARGB32(uint8_t, 0, 0, 1, 2, 0, 1, 3)
-#define Convert_BGR32_ARGB32_C CONVERT_TO_ARGB32(uint8_t, 0, 2, 1, 0, 0, 1, 3)
+#define Cvrt_XRGB32_ARGB32_C CVRT_TO_ARGB32(uint8_t, 8, 0, 1, 2, 3, 0, 1, 4)
+#define Cvrt_rgbA32_ARGB32_C CVRT_TO_ARGB32(uint8_t, 8, 3, 0, 1, 2, -1, 0, 4)
+#define Cvrt_RGBA32_ARGB32_C CVRT_TO_ARGB32(uint8_t, 8, 3, 0, 1, 2, 0, 0, 4)
+#define Cvrt_RGBX32_ARGB32_C CVRT_TO_ARGB32(uint8_t, 8, 3, 0, 1, 2, 0, 1, 4)
+#define Cvrt_bgrA32_ARGB32_C CVRT_TO_ARGB32(uint8_t, 8, 3, 2, 1, 0, -1, 0, 4)
+#define Cvrt_BGRA32_ARGB32_C CVRT_TO_ARGB32(uint8_t, 8, 3, 2, 1, 0, 0, 0, 4)
+#define Cvrt_BGRX32_ARGB32_C CVRT_TO_ARGB32(uint8_t, 8, 3, 2, 1, 0, 0, 1, 4)
+#define Cvrt_RGB32_ARGB32_C CVRT_TO_ARGB32(uint8_t, 8, 0, 0, 1, 2, 0, 1, 3)
+#define Cvrt_BGR32_ARGB32_C CVRT_TO_ARGB32(uint8_t, 8, 0, 2, 1, 0, 0, 1, 3)
 
-#define Convert_Argb32_ARGB32_C CONVERT_ARGB32_TO(uint8_t, 0, 1, 2, 3, -1, 0, 4)
-#define Convert_Argb32_XRGB32_C CONVERT_ARGB32_TO(uint8_t, 0, 1, 2, 3, -1, 1, 4)
-#define Convert_Argb32_rgbA32_C CONVERT_ARGB32_TO(uint8_t, 3, 0, 1, 2, 0, 0, 4)
-#define Convert_Argb32_RGBA32_C CONVERT_ARGB32_TO(uint8_t, 3, 0, 1, 2, -1, 0, 4)
-#define Convert_Argb32_RGBX32_C CONVERT_ARGB32_TO(uint8_t, 3, 0, 1, 2, -1, 1, 4)
-#define Convert_Argb32_bgrA32_C CONVERT_ARGB32_TO(uint8_t, 3, 2, 1, 0, 0, 0, 4)
-#define Convert_Argb32_BGRA32_C CONVERT_ARGB32_TO(uint8_t, 3, 2, 1, 0, -1, 0, 4)
-#define Convert_Argb32_BGRX32_C CONVERT_ARGB32_TO(uint8_t, 3, 2, 1, 0, -1, 1, 4)
-#define Convert_Argb32_RGB32_C CONVERT_ARGB32_TO(uint8_t, 0, 0, 1, 2, -1, 1, 3)
-#define Convert_Argb32_BGR32_C CONVERT_ARGB32_TO(uint8_t, 0, 2, 1, 0, -1, 1, 3)
+#define Cvrt_Argb32_ARGB32_C CVRT_ARGB32_TO(uint8_t, 8, 0, 1, 2, 3, -1, 0, 4)
+#define Cvrt_Argb32_XRGB32_C CVRT_ARGB32_TO(uint8_t, 8, 0, 1, 2, 3, -1, 1, 4)
+#define Cvrt_Argb32_rgbA32_C CVRT_ARGB32_TO(uint8_t, 8, 3, 0, 1, 2, 0, 0, 4)
+#define Cvrt_Argb32_RGBA32_C CVRT_ARGB32_TO(uint8_t, 8, 3, 0, 1, 2, -1, 0, 4)
+#define Cvrt_Argb32_RGBX32_C CVRT_ARGB32_TO(uint8_t, 8, 3, 0, 1, 2, -1, 1, 4)
+#define Cvrt_Argb32_bgrA32_C CVRT_ARGB32_TO(uint8_t, 8, 3, 2, 1, 0, 0, 0, 4)
+#define Cvrt_Argb32_BGRA32_C CVRT_ARGB32_TO(uint8_t, 8, 3, 2, 1, 0, -1, 0, 4)
+#define Cvrt_Argb32_BGRX32_C CVRT_ARGB32_TO(uint8_t, 8, 3, 2, 1, 0, -1, 1, 4)
+#define Cvrt_Argb32_RGB32_C CVRT_ARGB32_TO(uint8_t, 8, 0, 0, 1, 2, -1, 1, 3)
+#define Cvrt_Argb32_BGR32_C CVRT_ARGB32_TO(uint8_t, 8, 0, 2, 1, 0, -1, 1, 3)
 
-#define Convert_ARGB32_XRGB32_C CONVERT_ARGB32_TO(uint8_t, 0, 1, 2, 3, 0, 1, 4)
-#define Convert_ARGB32_rgbA32_C CONVERT_ARGB32_TO(uint8_t, 3, 0, 1, 2, 1, 0, 4)
-#define Convert_ARGB32_RGBA32_C CONVERT_ARGB32_TO(uint8_t, 3, 0, 1, 2, 0, 0, 4)
-#define Convert_ARGB32_RGBX32_C CONVERT_ARGB32_TO(uint8_t, 3, 0, 1, 2, 0, 1, 4)
-#define Convert_ARGB32_bgrA32_C CONVERT_ARGB32_TO(uint8_t, 3, 2, 1, 0, 1, 0, 4)
-#define Convert_ARGB32_BGRA32_C CONVERT_ARGB32_TO(uint8_t, 3, 2, 1, 0, 0, 0, 4)
-#define Convert_ARGB32_BGRX32_C CONVERT_ARGB32_TO(uint8_t, 3, 2, 1, 0, 0, 1, 4)
-#define Convert_ARGB32_RGB32_C CONVERT_ARGB32_TO(uint8_t, 0, 0, 1, 2, 0, 1, 3)
-#define Convert_ARGB32_BGR32_C CONVERT_ARGB32_TO(uint8_t, 0, 2, 1, 0, 0, 1, 3)
+#define Cvrt_ARGB32_XRGB32_C CVRT_ARGB32_TO(uint8_t, 8, 0, 1, 2, 3, 0, 1, 4)
+#define Cvrt_ARGB32_rgbA32_C CVRT_ARGB32_TO(uint8_t, 8, 3, 0, 1, 2, 1, 0, 4)
+#define Cvrt_ARGB32_RGBA32_C CVRT_ARGB32_TO(uint8_t, 8, 3, 0, 1, 2, 0, 0, 4)
+#define Cvrt_ARGB32_RGBX32_C CVRT_ARGB32_TO(uint8_t, 8, 3, 0, 1, 2, 0, 1, 4)
+#define Cvrt_ARGB32_bgrA32_C CVRT_ARGB32_TO(uint8_t, 8, 3, 2, 1, 0, 1, 0, 4)
+#define Cvrt_ARGB32_BGRA32_C CVRT_ARGB32_TO(uint8_t, 8, 3, 2, 1, 0, 0, 0, 4)
+#define Cvrt_ARGB32_BGRX32_C CVRT_ARGB32_TO(uint8_t, 8, 3, 2, 1, 0, 0, 1, 4)
+#define Cvrt_ARGB32_RGB32_C CVRT_ARGB32_TO(uint8_t, 8, 0, 0, 1, 2, 0, 1, 3)
+#define Cvrt_ARGB32_BGR32_C CVRT_ARGB32_TO(uint8_t, 8, 0, 2, 1, 0, 0, 1, 3)
 
-#define Convert_Argb38_Argb32_C CONVERT_TO_ARGB32(uint16_t, 0, 1, 2, 3, 0, 0, 4)
-#define Convert_Argb38_ARGB32_C \
-  CONVERT_TO_ARGB32(uint16_t, 0, 1, 2, 3, -1, 0, 4)
-#define Convert_Argb32_Argb38_C CONVERT_ARGB32_TO(uint16_t, 0, 1, 2, 3, 0, 0, 4)
-#define Convert_ARGB32_Argb38_C CONVERT_ARGB32_TO(uint16_t, 0, 1, 2, 3, 1, 0, 4)
+#define Cvrt_Argb38_Argb32_C CVRT_TO_ARGB32(uint16_t, 10, 0, 1, 2, 3, 0, 0, 4)
+#define Cvrt_Argb38_ARGB32_C CVRT_TO_ARGB32(uint16_t, 10, 0, 1, 2, 3, -1, 0, 4)
+#define Cvrt_Argb32_Argb38_C CVRT_ARGB32_TO(uint16_t, 10, 0, 1, 2, 3, 0, 0, 4)
+#define Cvrt_ARGB32_Argb38_C CVRT_ARGB32_TO(uint16_t, 10, 0, 1, 2, 3, 1, 0, 4)
+
+#define Cvrt_ARGB64_Argb32_C CVRT_TO_ARGB32(uint16_t, 16, 0, 1, 2, 3, 1, 0, 4)
+#define Cvrt_ARGB64_ARGB32_C CVRT_TO_ARGB32(uint16_t, 16, 0, 1, 2, 3, 0, 0, 4)
+#define Cvrt_Argb32_ARGB64_C CVRT_ARGB32_TO(uint16_t, 16, 0, 1, 2, 3, -1, 0, 4)
+#define Cvrt_ARGB32_ARGB64_C CVRT_ARGB32_TO(uint16_t, 16, 0, 1, 2, 3, 0, 0, 4)
 
 //------------------------------------------------------------------------------
 // SSE4.1 implementation
 
 #if defined(WP2_USE_SSE)
 
-void Premultiply(uint8_t* dst, uint32_t width) {
-  for (uint32_t i = 0; i < width; ++i, dst += 4) Premult(dst[0], dst + 1);
+void PremultiplyARGB32(uint8_t* dst, uint32_t width) {
+  for (uint32_t x = 0; x < width; ++x, dst += 4) {
+    Premult<uint8_t, 255>(*dst, dst[1], dst[2], dst[3]);
+  }
 }
 
 #define PSHUFB_CST32(CST) \
@@ -221,99 +247,105 @@
     if (x < width) NAME##_C(src + 4 * x, width - x, dst + 4 * x);             \
   }
 
-void Convert_Argb32_Argb32_SSE(const void* src, uint32_t width, void* dst) {
+void Cvrt_Argb32_Argb32_SSE(const void* src, uint32_t width, void* dst) {
   memcpy(dst, src, 4 * width);
 }
 
-CONVERT_32_TO_32_FILL_FUNC(Convert_XRGB32_Argb32, 0x030201ff, 0x000000ffu)
-void Convert_ARGB32_Argb32_SSE(const void* src, uint32_t width, void* dst) {
-  Convert_Argb32_Argb32_SSE(src, width, dst);
-  Premultiply((uint8_t*)dst, width);
+CONVERT_32_TO_32_FILL_FUNC(Cvrt_XRGB32_Argb32, 0x030201ff, 0x000000ffu)
+void Cvrt_ARGB32_Argb32_SSE(const void* src, uint32_t width, void* dst) {
+  Cvrt_Argb32_Argb32_SSE(src, width, dst);  // ARGB32 -> ARGB32
+  PremultiplyARGB32((uint8_t*)dst, width);  // ARGB32 -> Argb32
 }
 
-CONVERT_32_TO_32_FUNC(Convert_rgbA32_Argb32, 0x02010003)
-CONVERT_32_TO_32_FILL_FUNC(Convert_RGBX32_Argb32, 0x020100ff, 0x000000ffu)
-void Convert_RGBA32_Argb32_SSE(const void* src, uint32_t width, void* dst) {
-  Convert_rgbA32_Argb32_SSE(src, width, dst);
-  Premultiply((uint8_t*)dst, width);
+CONVERT_32_TO_32_FUNC(Cvrt_rgbA32_Argb32, 0x02010003)
+CONVERT_32_TO_32_FILL_FUNC(Cvrt_RGBX32_Argb32, 0x020100ff, 0x000000ffu)
+void Cvrt_RGBA32_Argb32_SSE(const void* src, uint32_t width, void* dst) {
+  Cvrt_rgbA32_Argb32_SSE(src, width, dst);  // RGBA32 -> ARGB32
+  PremultiplyARGB32((uint8_t*)dst, width);  // ARGB32 -> Argb32
 }
 
-CONVERT_32_TO_32_FUNC(Convert_bgrA32_Argb32, 0x00010203)
-CONVERT_32_TO_32_FILL_FUNC(Convert_BGRX32_Argb32, 0x000102ff, 0x000000ffu)
-void Convert_BGRA32_Argb32_SSE(const void* src, uint32_t width, void* dst) {
-  Convert_bgrA32_Argb32_SSE(src, width, dst);
-  Premultiply((uint8_t*)dst, width);
+CONVERT_32_TO_32_FUNC(Cvrt_bgrA32_Argb32, 0x00010203)
+CONVERT_32_TO_32_FILL_FUNC(Cvrt_BGRX32_Argb32, 0x000102ff, 0x000000ffu)
+void Cvrt_BGRA32_Argb32_SSE(const void* src, uint32_t width, void* dst) {
+  Cvrt_bgrA32_Argb32_SSE(src, width, dst);  // BGRA32 -> ARGB32
+  PremultiplyARGB32((uint8_t*)dst, width);  // ARGB32 -> Argb32
 }
 
 // ConvertTo
 
-void Unmultiply(uint8_t* const dst, const uint8_t* dst_a, uint32_t width) {
-  for (uint32_t x = 0; x < width; ++x) {
-    Unmult(dst_a[4 * x], dst + 4 * x);
+void Unmultiply8b(const uint8_t* dst_a, uint8_t* dst_rgb, uint32_t width) {
+  for (uint32_t x = 0; x < width; ++x, dst_a += 4, dst_rgb += 4) {
+    Unmult<uint8_t, 255, 255>(dst_a[0], dst_rgb[0], dst_rgb[1], dst_rgb[2]);
   }
 }
-void FillAlpha(uint8_t* dst_a, uint32_t width) {
+void FillAlpha8b(uint8_t* dst_a, uint32_t width) {
   for (uint32_t x = 0; x < width; ++x) {
     dst_a[4 * x] = 255;
   }
 }
 
-void Convert_Argb32_ARGB32_SSE(const void* src, uint32_t width, void* dst) {
-  Convert_Argb32_Argb32_SSE(src, width, dst);
-  Unmultiply((uint8_t*)dst + 1, (uint8_t*)dst + 0, width);
+void Cvrt_Argb32_ARGB32_SSE(const void* src, uint32_t width, void* dst) {
+  Cvrt_Argb32_Argb32_SSE(src, width, dst);  // Argb32 -> Argb32
+  Unmultiply8b(const_cast<const uint8_t*>(reinterpret_cast<uint8_t*>(dst) + 0),
+               reinterpret_cast<uint8_t*>(dst) + 1,
+               width);  // Argb32 -> ARGB32
 }
-void Convert_Argb32_XRGB32_SSE(const void* src, uint32_t width, void* dst) {
-  Convert_Argb32_ARGB32_SSE(src, width, dst);
-  FillAlpha((uint8_t*)dst + 0, width);
+void Cvrt_Argb32_XRGB32_SSE(const void* src, uint32_t width, void* dst) {
+  Cvrt_Argb32_ARGB32_SSE(src, width, dst);                  // Argb32 -> ARGB32
+  FillAlpha8b(reinterpret_cast<uint8_t*>(dst) + 0, width);  // ARGB32 -> XRGB32
 }
 
-CONVERT_32_TO_32_FUNC(Convert_Argb32_rgbA32, 0x00030201)
-void Convert_Argb32_RGBA32_SSE(const void* src, uint32_t width, void* dst) {
-  Convert_Argb32_rgbA32_SSE(src, width, dst);
-  Unmultiply((uint8_t*)dst + 0, (uint8_t*)dst + 3, width);
+CONVERT_32_TO_32_FUNC(Cvrt_Argb32_rgbA32, 0x00030201)
+void Cvrt_Argb32_RGBA32_SSE(const void* src, uint32_t width, void* dst) {
+  Cvrt_Argb32_rgbA32_SSE(src, width, dst);  // Argb32 -> rgbA32
+  Unmultiply8b(const_cast<const uint8_t*>(reinterpret_cast<uint8_t*>(dst) + 3),
+               reinterpret_cast<uint8_t*>(dst) + 0,
+               width);  // rgbA32 -> RGBA32
 }
-void Convert_Argb32_RGBX32_SSE(const void* src, uint32_t width, void* dst) {
-  Convert_Argb32_RGBA32_SSE(src, width, dst);
-  FillAlpha((uint8_t*)dst + 3, width);
+void Cvrt_Argb32_RGBX32_SSE(const void* src, uint32_t width, void* dst) {
+  Cvrt_Argb32_RGBA32_SSE(src, width, dst);                  // Argb32 -> rgbA32
+  FillAlpha8b(reinterpret_cast<uint8_t*>(dst) + 3, width);  // rgbA32 -> RGBX32
 }
 
-CONVERT_32_TO_32_FUNC(Convert_Argb32_bgrA32, 0x00010203)
-void Convert_Argb32_BGRA32_SSE(const void* src, uint32_t width, void* dst) {
-  Convert_Argb32_bgrA32_SSE(src, width, dst);
-  Unmultiply((uint8_t*)dst + 0, (uint8_t*)dst + 3, width);
+CONVERT_32_TO_32_FUNC(Cvrt_Argb32_bgrA32, 0x00010203)
+void Cvrt_Argb32_BGRA32_SSE(const void* src, uint32_t width, void* dst) {
+  Cvrt_Argb32_bgrA32_SSE(src, width, dst);  // Argb32 -> bgrA32
+  Unmultiply8b(const_cast<const uint8_t*>(reinterpret_cast<uint8_t*>(dst) + 3),
+               reinterpret_cast<uint8_t*>(dst) + 0,
+               width);  // bgrA32 -> BGRA32
 }
-void Convert_Argb32_BGRX32_SSE(const void* src, uint32_t width, void* dst) {
-  Convert_Argb32_BGRA32_SSE(src, width, dst);
-  FillAlpha((uint8_t*)dst + 3, width);
+void Cvrt_Argb32_BGRX32_SSE(const void* src, uint32_t width, void* dst) {
+  Cvrt_Argb32_BGRA32_SSE(src, width, dst);                  // Argb32 -> bgrA32
+  FillAlpha8b(reinterpret_cast<uint8_t*>(dst) + 3, width);  // bgrA32 -> BGRX32
 }
 
 #undef CONVERT_32_TO_32_FUNC
 #undef PSHUFB_CST32
 
 WP2_TSAN_IGNORE_FUNCTION void ArgbConverterDspInitSSE() {
-  WP2ArgbConvertFrom[WP2_Argb_32] = Convert_Argb32_Argb32_SSE;
-  WP2ArgbConvertFrom[WP2_ARGB_32] = Convert_ARGB32_Argb32_SSE;
-  WP2ArgbConvertFrom[WP2_XRGB_32] = Convert_XRGB32_Argb32_SSE;
+  WP2ArgbConvertFrom[WP2_Argb_32] = Cvrt_Argb32_Argb32_SSE;
+  WP2ArgbConvertFrom[WP2_ARGB_32] = Cvrt_ARGB32_Argb32_SSE;
+  WP2ArgbConvertFrom[WP2_XRGB_32] = Cvrt_XRGB32_Argb32_SSE;
 
-  WP2ArgbConvertFrom[WP2_rgbA_32] = Convert_rgbA32_Argb32_SSE;
-  WP2ArgbConvertFrom[WP2_RGBA_32] = Convert_RGBA32_Argb32_SSE;
-  WP2ArgbConvertFrom[WP2_RGBX_32] = Convert_RGBX32_Argb32_SSE;
+  WP2ArgbConvertFrom[WP2_rgbA_32] = Cvrt_rgbA32_Argb32_SSE;
+  WP2ArgbConvertFrom[WP2_RGBA_32] = Cvrt_RGBA32_Argb32_SSE;
+  WP2ArgbConvertFrom[WP2_RGBX_32] = Cvrt_RGBX32_Argb32_SSE;
 
-  WP2ArgbConvertFrom[WP2_bgrA_32] = Convert_bgrA32_Argb32_SSE;
-  WP2ArgbConvertFrom[WP2_BGRA_32] = Convert_BGRA32_Argb32_SSE;
-  WP2ArgbConvertFrom[WP2_BGRX_32] = Convert_BGRX32_Argb32_SSE;
+  WP2ArgbConvertFrom[WP2_bgrA_32] = Cvrt_bgrA32_Argb32_SSE;
+  WP2ArgbConvertFrom[WP2_BGRA_32] = Cvrt_BGRA32_Argb32_SSE;
+  WP2ArgbConvertFrom[WP2_BGRX_32] = Cvrt_BGRX32_Argb32_SSE;
 
-  WP2ArgbConvertTo[WP2_Argb_32] = Convert_Argb32_Argb32_SSE;
-  WP2ArgbConvertTo[WP2_ARGB_32] = Convert_Argb32_ARGB32_SSE;
-  WP2ArgbConvertTo[WP2_XRGB_32] = Convert_Argb32_XRGB32_SSE;
+  WP2ArgbConvertTo[WP2_Argb_32] = Cvrt_Argb32_Argb32_SSE;
+  WP2ArgbConvertTo[WP2_ARGB_32] = Cvrt_Argb32_ARGB32_SSE;
+  WP2ArgbConvertTo[WP2_XRGB_32] = Cvrt_Argb32_XRGB32_SSE;
 
-  WP2ArgbConvertTo[WP2_rgbA_32] = Convert_Argb32_rgbA32_SSE;
-  WP2ArgbConvertTo[WP2_RGBA_32] = Convert_Argb32_RGBA32_SSE;
-  WP2ArgbConvertTo[WP2_RGBX_32] = Convert_Argb32_RGBX32_SSE;
+  WP2ArgbConvertTo[WP2_rgbA_32] = Cvrt_Argb32_rgbA32_SSE;
+  WP2ArgbConvertTo[WP2_RGBA_32] = Cvrt_Argb32_RGBA32_SSE;
+  WP2ArgbConvertTo[WP2_RGBX_32] = Cvrt_Argb32_RGBX32_SSE;
 
-  WP2ArgbConvertTo[WP2_bgrA_32] = Convert_Argb32_bgrA32_SSE;
-  WP2ArgbConvertTo[WP2_BGRA_32] = Convert_Argb32_BGRA32_SSE;
-  WP2ArgbConvertTo[WP2_BGRX_32] = Convert_Argb32_BGRX32_SSE;
+  WP2ArgbConvertTo[WP2_bgrA_32] = Cvrt_Argb32_bgrA32_SSE;
+  WP2ArgbConvertTo[WP2_BGRA_32] = Cvrt_Argb32_BGRA32_SSE;
+  WP2ArgbConvertTo[WP2_BGRX_32] = Cvrt_Argb32_BGRX32_SSE;
 }
 
 #endif  // WP2_USE_SSE
@@ -336,73 +368,81 @@
 WP2_TSAN_IGNORE_FUNCTION void WP2ArgbConverterInit() {
   if (argb_converter_last_cpuinfo_used == WP2GetCPUInfo) return;
 
-  WP2ArgbConvertFrom[WP2_Argb_32] = Convert_Argb32_Argb32_C;
-  WP2ArgbConvertFrom[WP2_ARGB_32] = Convert_ARGB32_Argb32_C;
-  WP2ArgbConvertFrom[WP2_XRGB_32] = Convert_XRGB32_Argb32_C;
+  WP2ArgbConvertFrom[WP2_Argb_32] = Cvrt_Argb32_Argb32_C;
+  WP2ArgbConvertFrom[WP2_ARGB_32] = Cvrt_ARGB32_Argb32_C;
+  WP2ArgbConvertFrom[WP2_XRGB_32] = Cvrt_XRGB32_Argb32_C;
 
-  WP2ArgbConvertFrom[WP2_rgbA_32] = Convert_rgbA32_Argb32_C;
-  WP2ArgbConvertFrom[WP2_RGBA_32] = Convert_RGBA32_Argb32_C;
-  WP2ArgbConvertFrom[WP2_RGBX_32] = Convert_RGBX32_Argb32_C;
+  WP2ArgbConvertFrom[WP2_rgbA_32] = Cvrt_rgbA32_Argb32_C;
+  WP2ArgbConvertFrom[WP2_RGBA_32] = Cvrt_RGBA32_Argb32_C;
+  WP2ArgbConvertFrom[WP2_RGBX_32] = Cvrt_RGBX32_Argb32_C;
 
-  WP2ArgbConvertFrom[WP2_bgrA_32] = Convert_bgrA32_Argb32_C;
-  WP2ArgbConvertFrom[WP2_BGRA_32] = Convert_BGRA32_Argb32_C;
-  WP2ArgbConvertFrom[WP2_BGRX_32] = Convert_BGRX32_Argb32_C;
+  WP2ArgbConvertFrom[WP2_bgrA_32] = Cvrt_bgrA32_Argb32_C;
+  WP2ArgbConvertFrom[WP2_BGRA_32] = Cvrt_BGRA32_Argb32_C;
+  WP2ArgbConvertFrom[WP2_BGRX_32] = Cvrt_BGRX32_Argb32_C;
 
-  WP2ArgbConvertFrom[WP2_RGB_24] = Convert_RGB32_Argb32_C;
-  WP2ArgbConvertFrom[WP2_BGR_24] = Convert_BGR32_Argb32_C;
+  WP2ArgbConvertFrom[WP2_RGB_24] = Cvrt_RGB32_Argb32_C;
+  WP2ArgbConvertFrom[WP2_BGR_24] = Cvrt_BGR32_Argb32_C;
 
-  WP2ArgbConvertFrom[WP2_Argb_38] = Convert_Argb38_Argb32_C;
+  WP2ArgbConvertFrom[WP2_Argb_38] = Cvrt_Argb38_Argb32_C;
 
-  WP2ArgbConvertTo[WP2_Argb_32] = Convert_Argb32_Argb32_C;
-  WP2ArgbConvertTo[WP2_ARGB_32] = Convert_Argb32_ARGB32_C;
-  WP2ArgbConvertTo[WP2_XRGB_32] = Convert_Argb32_XRGB32_C;
+  WP2ArgbConvertFrom[WP2_ARGB_64] = Cvrt_ARGB64_Argb32_C;
 
-  WP2ArgbConvertTo[WP2_rgbA_32] = Convert_Argb32_rgbA32_C;
-  WP2ArgbConvertTo[WP2_RGBA_32] = Convert_Argb32_RGBA32_C;
-  WP2ArgbConvertTo[WP2_RGBX_32] = Convert_Argb32_RGBX32_C;
+  WP2ArgbConvertTo[WP2_Argb_32] = Cvrt_Argb32_Argb32_C;
+  WP2ArgbConvertTo[WP2_ARGB_32] = Cvrt_Argb32_ARGB32_C;
+  WP2ArgbConvertTo[WP2_XRGB_32] = Cvrt_Argb32_XRGB32_C;
 
-  WP2ArgbConvertTo[WP2_bgrA_32] = Convert_Argb32_bgrA32_C;
-  WP2ArgbConvertTo[WP2_BGRA_32] = Convert_Argb32_BGRA32_C;
-  WP2ArgbConvertTo[WP2_BGRX_32] = Convert_Argb32_BGRX32_C;
+  WP2ArgbConvertTo[WP2_rgbA_32] = Cvrt_Argb32_rgbA32_C;
+  WP2ArgbConvertTo[WP2_RGBA_32] = Cvrt_Argb32_RGBA32_C;
+  WP2ArgbConvertTo[WP2_RGBX_32] = Cvrt_Argb32_RGBX32_C;
 
-  WP2ArgbConvertTo[WP2_RGB_24] = Convert_Argb32_RGB32_C;
-  WP2ArgbConvertTo[WP2_BGR_24] = Convert_Argb32_BGR32_C;
+  WP2ArgbConvertTo[WP2_bgrA_32] = Cvrt_Argb32_bgrA32_C;
+  WP2ArgbConvertTo[WP2_BGRA_32] = Cvrt_Argb32_BGRA32_C;
+  WP2ArgbConvertTo[WP2_BGRX_32] = Cvrt_Argb32_BGRX32_C;
 
-  WP2ArgbConvertTo[WP2_Argb_38] = Convert_Argb32_Argb38_C;
+  WP2ArgbConvertTo[WP2_RGB_24] = Cvrt_Argb32_RGB32_C;
+  WP2ArgbConvertTo[WP2_BGR_24] = Cvrt_Argb32_BGR32_C;
 
-  WP2ARGBConvertFrom[WP2_Argb_32] = Convert_Argb32_ARGB32_C;
-  WP2ARGBConvertFrom[WP2_ARGB_32] = Convert_ARGB32_ARGB32_C;
-  WP2ARGBConvertFrom[WP2_XRGB_32] = Convert_XRGB32_ARGB32_C;
+  WP2ArgbConvertTo[WP2_Argb_38] = Cvrt_Argb32_Argb38_C;
 
-  WP2ARGBConvertFrom[WP2_rgbA_32] = Convert_rgbA32_ARGB32_C;
-  WP2ARGBConvertFrom[WP2_RGBA_32] = Convert_RGBA32_ARGB32_C;
-  WP2ARGBConvertFrom[WP2_RGBX_32] = Convert_RGBX32_ARGB32_C;
+  WP2ArgbConvertTo[WP2_ARGB_64] = Cvrt_Argb32_ARGB64_C;
 
-  WP2ARGBConvertFrom[WP2_bgrA_32] = Convert_bgrA32_ARGB32_C;
-  WP2ARGBConvertFrom[WP2_BGRA_32] = Convert_BGRA32_ARGB32_C;
-  WP2ARGBConvertFrom[WP2_BGRX_32] = Convert_BGRX32_ARGB32_C;
+  WP2ARGBConvertFrom[WP2_Argb_32] = Cvrt_Argb32_ARGB32_C;
+  WP2ARGBConvertFrom[WP2_ARGB_32] = Cvrt_ARGB32_ARGB32_C;
+  WP2ARGBConvertFrom[WP2_XRGB_32] = Cvrt_XRGB32_ARGB32_C;
 
-  WP2ARGBConvertFrom[WP2_RGB_24] = Convert_RGB32_ARGB32_C;
-  WP2ARGBConvertFrom[WP2_BGR_24] = Convert_BGR32_ARGB32_C;
+  WP2ARGBConvertFrom[WP2_rgbA_32] = Cvrt_rgbA32_ARGB32_C;
+  WP2ARGBConvertFrom[WP2_RGBA_32] = Cvrt_RGBA32_ARGB32_C;
+  WP2ARGBConvertFrom[WP2_RGBX_32] = Cvrt_RGBX32_ARGB32_C;
 
-  WP2ARGBConvertFrom[WP2_Argb_38] = Convert_Argb38_ARGB32_C;
+  WP2ARGBConvertFrom[WP2_bgrA_32] = Cvrt_bgrA32_ARGB32_C;
+  WP2ARGBConvertFrom[WP2_BGRA_32] = Cvrt_BGRA32_ARGB32_C;
+  WP2ARGBConvertFrom[WP2_BGRX_32] = Cvrt_BGRX32_ARGB32_C;
 
-  WP2ARGBConvertTo[WP2_Argb_32] = Convert_ARGB32_Argb32_C;
-  WP2ARGBConvertTo[WP2_ARGB_32] = Convert_ARGB32_ARGB32_C;
-  WP2ARGBConvertTo[WP2_XRGB_32] = Convert_ARGB32_XRGB32_C;
+  WP2ARGBConvertFrom[WP2_RGB_24] = Cvrt_RGB32_ARGB32_C;
+  WP2ARGBConvertFrom[WP2_BGR_24] = Cvrt_BGR32_ARGB32_C;
 
-  WP2ARGBConvertTo[WP2_rgbA_32] = Convert_ARGB32_rgbA32_C;
-  WP2ARGBConvertTo[WP2_RGBA_32] = Convert_ARGB32_RGBA32_C;
-  WP2ARGBConvertTo[WP2_RGBX_32] = Convert_ARGB32_RGBX32_C;
+  WP2ARGBConvertFrom[WP2_Argb_38] = Cvrt_Argb38_ARGB32_C;
 
-  WP2ARGBConvertTo[WP2_bgrA_32] = Convert_ARGB32_bgrA32_C;
-  WP2ARGBConvertTo[WP2_BGRA_32] = Convert_ARGB32_BGRA32_C;
-  WP2ARGBConvertTo[WP2_BGRX_32] = Convert_ARGB32_BGRX32_C;
+  WP2ARGBConvertFrom[WP2_ARGB_64] = Cvrt_ARGB64_ARGB32_C;
 
-  WP2ARGBConvertTo[WP2_RGB_24] = Convert_ARGB32_RGB32_C;
-  WP2ARGBConvertTo[WP2_BGR_24] = Convert_ARGB32_BGR32_C;
+  WP2ARGBConvertTo[WP2_Argb_32] = Cvrt_ARGB32_Argb32_C;
+  WP2ARGBConvertTo[WP2_ARGB_32] = Cvrt_ARGB32_ARGB32_C;
+  WP2ARGBConvertTo[WP2_XRGB_32] = Cvrt_ARGB32_XRGB32_C;
 
-  WP2ARGBConvertTo[WP2_Argb_38] = Convert_ARGB32_Argb38_C;
+  WP2ARGBConvertTo[WP2_rgbA_32] = Cvrt_ARGB32_rgbA32_C;
+  WP2ARGBConvertTo[WP2_RGBA_32] = Cvrt_ARGB32_RGBA32_C;
+  WP2ARGBConvertTo[WP2_RGBX_32] = Cvrt_ARGB32_RGBX32_C;
+
+  WP2ARGBConvertTo[WP2_bgrA_32] = Cvrt_ARGB32_bgrA32_C;
+  WP2ARGBConvertTo[WP2_BGRA_32] = Cvrt_ARGB32_BGRA32_C;
+  WP2ARGBConvertTo[WP2_BGRX_32] = Cvrt_ARGB32_BGRX32_C;
+
+  WP2ARGBConvertTo[WP2_RGB_24] = Cvrt_ARGB32_RGB32_C;
+  WP2ARGBConvertTo[WP2_BGR_24] = Cvrt_ARGB32_BGR32_C;
+
+  WP2ARGBConvertTo[WP2_Argb_38] = Cvrt_ARGB32_Argb38_C;
+
+  WP2ARGBConvertTo[WP2_ARGB_64] = Cvrt_ARGB32_ARGB64_C;
 
   if (WP2GetCPUInfo != nullptr) {
 #if defined(WP2_USE_SSE)
@@ -419,20 +459,37 @@
 
 template <WP2SampleFormat from, WP2SampleFormat to>
 void WP2ConvertFromTo(const void* src, uint32_t width, void* dst) {
-  static_assert(WP2Formatbpc(from) == 8, "Unimplemented");
-  static_assert(WP2Formatbpc(to) == 8, "Unimplemented");
   const size_t src_bytes_per_pixel =
       WP2FormatBpp(static_cast<WP2SampleFormat>(from));
   const size_t dst_bytes_per_pixel =
       WP2FormatBpp(static_cast<WP2SampleFormat>(to));
 
-  // Use a temporary layout Argb or ARGB to go from SRC to DST.
+  // Use a temporary layout Argb32 or ARGB32 to go from src to dst.
+  // Bit depth changes can only go from 8-bit or to 8-bit.
+  static_assert(WP2Formatbpc(from) == 8 || WP2Formatbpc(to) == 8,
+                "Unimplemented");
+
   // Do not premultiply or unmultiply unless necessary.
-  // The functions below do not work inplace so use a temporary buffer.
+  // Premultiply or unmultiply during the bit depth change if any, to benefit
+  // from the increased precision. The other step of shuffling channels is
+  // lossless.
+  bool tmp_is_premul;
+  if (WP2Formatbpc(static_cast<WP2SampleFormat>(from)) <
+      WP2Formatbpc(static_cast<WP2SampleFormat>(to))) {
+    // For example, for going from 8-bit to 16-bit, it is better to first
+    // shuffle channels losslessly, then change the bit depth and unmul/premul.
+    tmp_is_premul = WP2IsPremultiplied(static_cast<WP2SampleFormat>(from));
+  } else {
+    // For example, for going from 16-bit to 8-bit, it is better to first change
+    // the bit depth and unmul/premul, then shuffle channels losslessly.
+    tmp_is_premul = WP2IsPremultiplied(static_cast<WP2SampleFormat>(to));
+  }
+
+  // The functions below do not work inplace so use an 8-bit temporary buffer.
   uint8_t tmp[128];
   while (width > 0) {
     const uint32_t num_pixels = std::min<uint32_t>(width, sizeof(tmp) / 4);
-    if (WP2IsPremultiplied(static_cast<WP2SampleFormat>(from))) {
+    if (tmp_is_premul) {
       WP2ArgbConvertFrom[from](src, num_pixels, tmp);
       WP2ArgbConvertTo[to](tmp, num_pixels, dst);
     } else {
@@ -447,7 +504,7 @@
 }
 
 template <WP2SampleFormat from>
-WP2ArgbConverterF WP2ConvertFromToFunc(WP2SampleFormat to) {
+WP2ArgbConverterF WP2ConvertFromTo8bFunc(WP2SampleFormat to) {
   if (to == WP2_XRGB_32) return WP2ConvertFromTo<from, WP2_XRGB_32>;
   if (to == WP2_rgbA_32) return WP2ConvertFromTo<from, WP2_rgbA_32>;
   if (to == WP2_RGBA_32) return WP2ConvertFromTo<from, WP2_RGBA_32>;
@@ -460,6 +517,13 @@
   return nullptr;
 }
 
+template <WP2SampleFormat from>
+WP2ArgbConverterF WP2ConvertFromToFunc(WP2SampleFormat to) {
+  if (to == WP2_Argb_38) return WP2ConvertFromTo<from, WP2_Argb_38>;
+  if (to == WP2_ARGB_64) return WP2ConvertFromTo<from, WP2_ARGB_64>;
+  return WP2ConvertFromTo8bFunc<from>(to);
+}
+
 }  // namespace
 
 //------------------------------------------------------------------------------
@@ -485,6 +549,10 @@
   if (from == WP2_BGRX_32) return WP2ConvertFromToFunc<WP2_BGRX_32>(to);
   if (from == WP2_RGB_24) return WP2ConvertFromToFunc<WP2_RGB_24>(to);
   if (from == WP2_BGR_24) return WP2ConvertFromToFunc<WP2_BGR_24>(to);
+  if (to < WP2_Argb_38) {
+    if (from == WP2_Argb_38) return WP2ConvertFromTo8bFunc<WP2_Argb_38>(to);
+    if (from == WP2_ARGB_64) return WP2ConvertFromTo8bFunc<WP2_ARGB_64>(to);
+  }
   return nullptr;
 }
 
diff --git a/src/enc/anim/anim_enc.cc b/src/enc/anim/anim_enc.cc
index 04a6c84..f57ccab 100644
--- a/src/enc/anim/anim_enc.cc
+++ b/src/enc/anim/anim_enc.cc
@@ -192,8 +192,7 @@
   const bool has_trailing_data =
       (metadata.xmp.size > 0) || (metadata.exif.size > 0);
 
-  const uint32_t rgb_bit_depth =
-      (first_frame.rgb_pixels.format() == WP2_Argb_38) ? 10 : 8;
+  const uint32_t rgb_bit_depth = WP2Formatbpc(first_frame.rgb_pixels.format());
   for (const Frame& frame : frames) {
     if (frame.rgb_pixels.format() != first_frame.rgb_pixels.format()) {
       assert(false);
diff --git a/src/enc/bypass_enc.cc b/src/enc/bypass_enc.cc
index 80c63cd..f7b116a 100644
--- a/src/enc/bypass_enc.cc
+++ b/src/enc/bypass_enc.cc
@@ -31,7 +31,8 @@
 
 //------------------------------------------------------------------------------
 
-WP2Status BypassTileEnc(const GlobalParams& gparams, EncTile* const tile,
+WP2Status BypassTileEnc(const GlobalParams& gparams,
+                        bool image_is_premultiplied, EncTile* const tile,
                         Writer* const output) {
   const uint32_t rgb_bit_depth = WP2Formatbpc(tile->rgb_input.format());
   const uint32_t max_num_bytes =
@@ -42,13 +43,32 @@
     // RGB input is expected.
     assert(!tile->rgb_input.IsEmpty());
     if (rgb_bit_depth <= 8) {
+      assert(tile->rgb_input.format() == WP2_Argb_32 ||
+             tile->rgb_input.format() == WP2_ARGB_32);
+      if (!image_is_premultiplied) {
+        // There should be no loss here.
+        assert(!WP2IsPremultiplied(tile->rgb_input.format()));
+      }
       for (uint32_t y = 0; y < tile->rect.height; ++y) {
-        const uint8_t* const row = tile->rgb_input.GetRow8(y);
+        ArgbBuffer input_row(tile->rgb_input.format());
+        WP2_CHECK_STATUS(
+            input_row.SetView(tile->rgb_input, {0, y, tile->rect.width, 1}));
+        // Respect the header's image_is_premultiplied flag, otherwise the
+        // decoder will not be able to know whether the samples were
+        // premultiplied. Each non-bypass tile explicitly signals that
+        // information on top of the image header.
+        ArgbBuffer raw_row(image_is_premultiplied ? WP2_Argb_32 : WP2_ARGB_32);
+        if (raw_row.format() == input_row.format()) {
+          WP2_CHECK_STATUS(raw_row.SetView(input_row));
+        } else {
+          WP2_CHECK_STATUS(raw_row.ConvertFrom(input_row));
+        }
+        const uint8_t* const row = raw_row.GetRow8(0);
         if (gparams.has_alpha_) {
-          // Write rows of Argb samples.
+          // Write rows of Argb or ARGB samples.
           WP2_CHECK_ALLOC_OK(output->Append((void*)row, tile->rect.width * 4));
         } else {
-          // Write only rgb.
+          // Write only RGB.
           for (uint32_t x = 0; x < tile->rect.width; ++x) {
             WP2_CHECK_ALLOC_OK(output->Append((void*)&row[x * 4 + 1], 3));
           }
@@ -56,7 +76,9 @@
       }
       // TODO(yguyon): Simulate ANSDebugPrefix to count these raw bytes?
     } else {
+      // Only premultiplied WP2_Argb_38 is available here.
       assert(tile->rgb_input.format() == WP2_Argb_38);
+      assert(image_is_premultiplied);
       // Allocate, fill and write the 'data' to 'output'.
       // TODO(yguyon): Use less memory by outputting 8 lines by 8 lines?
       Data data;
diff --git a/src/enc/distortion_enc.cc b/src/enc/distortion_enc.cc
index 44e98c5..591c365 100644
--- a/src/enc/distortion_enc.cc
+++ b/src/enc/distortion_enc.cc
@@ -24,6 +24,7 @@
 #include <cstdlib>
 #include <cstring>
 
+#include "src/common/color_precision.h"
 #include "src/dsp/dsp.h"
 #include "src/utils/csp.h"
 #include "src/utils/plane.h"
@@ -447,14 +448,18 @@
   result[0] = result[1] = result[2] = result[3] = result[4] = 0.f;
 
   const double pix_size = width() * height();
-  const double max = 255.;  // Maximum value per pixel per channel.
+  double total_max = 0.;  // Sum of the maximum values of each channel.
   double total_distortion = 0.;
   uint32_t alpha_channel_index = 0;
   WP2FormatHasAlpha(format_, &alpha_channel_index);
   const bool has_transparency = HasTransparency();
   const bool include_alpha = has_transparency && include_alpha_in_all;
-  const uint32_t pix_channel_size = pix_size * (include_alpha ? 4 : 3);
+  const uint32_t num_channels = include_alpha ? 4 : 3;
+  const uint32_t pix_channel_size = pix_size * num_channels;
   if (metric_type == PSNR) {
+    WP2_CHECK_OK(WP2FormatBpp(format_) == 3 || WP2FormatBpp(format_) == 4 ||
+                     WP2FormatBpp(format_) == 8,
+                 WP2_STATUS_UNSUPPORTED_FEATURE);
     WP2PSNRInit();
     auto metric_call = (WP2FormatBpp(format_) == 4) ? WP2SumSquaredError4x8u
                                                     : WP2SumSquaredError3x8u;
@@ -471,12 +476,17 @@
       }
     }
     for (uint32_t c = 0; c < 4; ++c) {
+      const double max = WP2::FormatMax(
+          ref.format(), (c == alpha_channel_index && include_alpha) ? 0 : 1);
       if (c != alpha_channel_index || include_alpha) {
+        total_max += max;
         total_distortion += (float)result_64b[c];
       }
       result[c] = (float)GetPSNR(result_64b[c], pix_size, max);
     }
-    result[4] = (float)GetPSNR(total_distortion, pix_channel_size, max);
+    // TODO: b/382677532 - Verify formula (maybe shift alpha to 10b)
+    result[4] = (float)GetPSNR(total_distortion, pix_channel_size,
+                               total_max / num_channels);
   } else if (metric_type == SSIM) {
     WP2_CHECK_OK(WP2FormatBpp(format_) == 4, WP2_STATUS_UNSUPPORTED_FEATURE);
     WP2SSIMInit();
@@ -524,12 +534,17 @@
     WP2_CHECK_OK(WP2FormatBpp(format_) == 4, WP2_STATUS_UNSUPPORTED_FEATURE);
     for (uint32_t c = 0; c < 4; ++c) {
       const double disto = AccumulateLSIM(*this, ref, c);
+      const double max = WP2::FormatMax(
+          ref.format(), (c == alpha_channel_index && include_alpha) ? 0 : 1);
       if (c != alpha_channel_index || include_alpha) {
+        total_max += max;
         total_distortion += disto;
       }
       result[c] = (float)GetPSNR(disto, pix_size, max);
     }
-    result[4] = (float)GetPSNR(total_distortion, pix_channel_size, max);
+    // TODO: b/382677532 - Verify formula (maybe shift alpha to 10b)
+    result[4] = (float)GetPSNR(total_distortion, pix_channel_size,
+                               total_max / num_channels);
   } else {
     // We are working in YUV space.
     CSPTransform transf;
@@ -546,10 +561,17 @@
       result[2] = CalcPSNRHVS(yuv0.U, yuv1.U, kMaskU);
       result[3] = CalcPSNRHVS(yuv0.V, yuv1.V, kMaskV);
       for (uint32_t c = 0; c < 4; ++c) {
-        if (c > 0 || include_alpha) total_distortion += result[c];
+        const double max =
+            WP2::FormatMax(ref.format(), (c == 0 && include_alpha) ? 0 : 1);
+        if (c > 0 || include_alpha) {
+          total_max += max;
+          total_distortion += result[c];
+        }
         result[c] = (float)GetPSNR(result[c], pix_size, max);
       }
-      result[4] = (float)GetPSNR(total_distortion, pix_channel_size, max);
+      // TODO: b/382677532 - Verify formula (maybe shift alpha to 10b)
+      result[4] = (float)GetPSNR(total_distortion, pix_channel_size,
+                                 total_max / num_channels);
     } else if (metric_type == PSNR_YUV) {
       WP2_CHECK_STATUS(
           yuv0.GetDistortion(yuv1, transf.GetYuvDepth(), PSNR, result));
diff --git a/src/enc/lossless/losslessi_enc.cc b/src/enc/lossless/losslessi_enc.cc
index 87aa58e..057c01a 100644
--- a/src/enc/lossless/losslessi_enc.cc
+++ b/src/enc/lossless/losslessi_enc.cc
@@ -27,6 +27,7 @@
 #include <memory>
 #include <string>
 
+#include "src/common/color_precision.h"
 #include "src/common/constants.h"
 #include "src/common/lossless/color_cache.h"
 #include "src/common/progress_watcher.h"
@@ -836,6 +837,7 @@
   if (!can_premultiply) assert(!config.use_premultiplied);
   argb_buffer_.has_alpha = has_alpha_;
   argb_buffer_.channel_bits = WP2Formatbpc(pic_.format());
+  const uint32_t max_value = WP2::FormatMax(pic_.format(), 1);
   for (uint32_t y = 0; y < height; ++y) {
     int16_t* const dst_row = argb_buffer_.GetRow(y);
     if (WP2Formatbpc(pic_.format()) <= 8) {
@@ -849,7 +851,8 @@
           } else if (config.use_premultiplied) {
             dst_row[4 * x + 0] = a;
             for (uint32_t c = 1; c < 4; ++c) {
-              dst_row[4 * x + c] = WP2::DivBy255(src_row[4 * x + c] * a);
+              dst_row[4 * x + c] = static_cast<int16_t>(
+                  std::min((src_row[4 * x + c] * a + 127u) / 255u, max_value));
             }
           } else {
             std::copy(src_row + 4 * x, src_row + 4 * (x + 1), dst_row + 4 * x);
diff --git a/src/enc/main_enc.cc b/src/enc/main_enc.cc
index 9d17db9..f937ee6 100644
--- a/src/enc/main_enc.cc
+++ b/src/enc/main_enc.cc
@@ -293,7 +293,7 @@
   WP2_CHECK_OK(config.IsValid(), WP2_STATUS_INVALID_CONFIGURATION);
 
   if (input.format() != WP2_Argb_32 && input.format() != WP2_ARGB_32 &&
-      input.format() != WP2_Argb_38) {
+      input.format() != WP2_Argb_38 && input.format() != WP2_ARGB_64) {
     ArgbBuffer converted(WP2IsPremultiplied(input.format()) ? WP2_Argb_32
                                                             : WP2_ARGB_32);
     WP2_CHECK_STATUS(converted.ConvertFrom(input));
diff --git a/src/enc/preview/preview_color.cc b/src/enc/preview/preview_color.cc
index 920b5c8..ec6d39c 100644
--- a/src/enc/preview/preview_color.cc
+++ b/src/enc/preview/preview_color.cc
@@ -145,7 +145,6 @@
                           WP2::DivBy255(row[x * 4 + 3] * alpha));
         }
       }
-
     } else {
       assert(WP2IsPremultiplied(canvas.format()));
       const uint16_t* const row = canvas.GetRow16(y);
diff --git a/src/enc/tile_enc.cc b/src/enc/tile_enc.cc
index 9e68a2c..eb616b0 100644
--- a/src/enc/tile_enc.cc
+++ b/src/enc/tile_enc.cc
@@ -206,7 +206,9 @@
     if (num_bytes >= max_num_bytes) {
       // Discard ANS encoding because it takes at least as many bytes as raw
       // pixels.
-      WP2_CHECK_STATUS(BypassTileEnc(*tiles_layout->gparams, &tile, output));
+      WP2_CHECK_STATUS(BypassTileEnc(*tiles_layout->gparams,
+                                     tiles_layout->image_is_premultiplied,
+                                     &tile, output));
     } else {
       if (tile.data.size > 0) {
         WP2_CHECK_ALLOC_OK(output->Append(tile.data.bytes, tile.data.size));
diff --git a/src/enc/tile_enc.h b/src/enc/tile_enc.h
index aeef3e3..afe81c6 100644
--- a/src/enc/tile_enc.h
+++ b/src/enc/tile_enc.h
@@ -98,7 +98,8 @@
 };
 
 // Encodes pixels as row-ordered raw samples.
-WP2Status BypassTileEnc(const GlobalParams& gparams, EncTile* tile,
+WP2Status BypassTileEnc(const GlobalParams& gparams,
+                        bool image_is_premultiplied, EncTile* tile,
                         Writer* output);
 
 //------------------------------------------------------------------------------
diff --git a/src/utils/argb_buffer.cc b/src/utils/argb_buffer.cc
index 66c640f..01da4c6 100644
--- a/src/utils/argb_buffer.cc
+++ b/src/utils/argb_buffer.cc
@@ -470,10 +470,11 @@
       if (WP2HasOtherValue8b32b(row, width_, 0xff)) return true;
     }
   } else {
+    const uint32_t alpha_max = FormatMax(format_, 0);
     for (uint32_t y = 0; y < height_; ++y) {
       const uint16_t* const row = GetRow16(y) + alpha_channel_index;
       for (uint32_t x = 0; x < width_; ++x) {
-        if (row[4 * x + 0] != WP2::kAlphaMax) return true;
+        if (row[4 * x + 0] != alpha_max) return true;
       }
     }
   }
diff --git a/src/wp2/base.h b/src/wp2/base.h
index 3899c82..c55032b 100644
--- a/src/wp2/base.h
+++ b/src/wp2/base.h
@@ -135,7 +135,9 @@
   // HDR format: 8 bits for A, 10 per RGB.
   WP2_Argb_38,
 
-  // TODO(skal): RGB565_16? rgbA4444_16? RGBA4444_16?
+  // 64b/pixel format: 16 bits for A, 16 per RGB.
+  WP2_ARGB_64,
+
   WP2_FORMAT_NUM
 } WP2SampleFormat;
 
@@ -156,6 +158,7 @@
     case WP2_BGR_24:
       return 3;
     case WP2_Argb_38:
+    case WP2_ARGB_64:
       // actually 38 bits, stored as uint16
       return 8;
     case WP2_FORMAT_NUM:
@@ -183,6 +186,8 @@
       return 8;
     case WP2_Argb_38:
       return 10;
+    case WP2_ARGB_64:
+      return 16;
     case WP2_FORMAT_NUM:
       assert(false);
       return 0;
@@ -201,6 +206,7 @@
     case WP2_Argb_32:
     case WP2_ARGB_32:
     case WP2_Argb_38:
+    case WP2_ARGB_64:
       if (alpha_channel_index != nullptr) *alpha_channel_index = 0;
       return true;
     case WP2_rgbA_32:
@@ -244,6 +250,7 @@
     case WP2_BGRX_32:
     case WP2_RGB_24:
     case WP2_BGR_24:
+    case WP2_ARGB_64:
       return false;
     case WP2_FORMAT_NUM:
       assert(false);
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 1a753ff..a9e9e61 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -294,6 +294,7 @@
     add_wp2_test(test_distortion 4 extras tests_helpers_filter)
     add_wp2_test(test_extras 1 extras)
     add_wp2_test(test_imageio 1 extras)
+    add_wp2_test(test_imageio_16b 1 extras)
     add_wp2_test(test_imageio_conversion 1 extras)
     add_wp2_test(test_transf 1 extras)
     add_wp2_test(test_version 1 extras)
diff --git a/tests/lossless/test_group4.cc b/tests/lossless/test_group4.cc
index b1099db..687b6c0 100644
--- a/tests/lossless/test_group4.cc
+++ b/tests/lossless/test_group4.cc
@@ -27,6 +27,7 @@
 #include "src/wp2/decode.h"
 #include "src/wp2/encode.h"
 #include "src/wp2/format_constants.h"
+#include "gtest/gtest.h"
 #include "tests/include/helpers.h"
 
 namespace WP2 {
diff --git a/tests/test_api.cc b/tests/test_api.cc
index dbae6cb..4f8a1f2 100644
--- a/tests/test_api.cc
+++ b/tests/test_api.cc
@@ -72,7 +72,7 @@
       ((uint8_t*)input.GetRow(0))[i] = (uint8_t)0xFF;
     }
     const WP2Status status = Encode(input, &output);
-    if (format != WP2_Argb_38) {
+    if (format != WP2_Argb_38 && format != WP2_ARGB_64) {
       ASSERT_WP2_OK(status);
     } else {
       ASSERT_EQ(status, WP2_STATUS_INVALID_COLORSPACE);
diff --git a/tests/test_buffer.cc b/tests/test_buffer.cc
index b0bb2ab..7114ec8 100644
--- a/tests/test_buffer.cc
+++ b/tests/test_buffer.cc
@@ -497,11 +497,7 @@
       EXPECT_EQ(result[1], 99.f);  // r (or y for PSNRHVS)
       EXPECT_EQ(result[2], 99.f);  // g (or u for PSNRHVS)
       EXPECT_EQ(result[3], 99.f);  // b (or v for PSNRHVS)
-      if (metric == MetricType::PSNR_YUV) {
-        EXPECT_LT(result[4], 60.f);  // all
-      } else {
-        EXPECT_LT(result[4], 50.f);  // all
-      }
+      EXPECT_LT(result[4], 60.f);  // all
       // Not yet implemented.
       if (format == WP2_Argb_38) continue;
       ASSERT_WP2_OK(compressed.GetDistortionBlackOrWhiteBackground(
diff --git a/tests/test_bypass.cc b/tests/test_bypass.cc
index 8c5ec02..82a0550 100644
--- a/tests/test_bypass.cc
+++ b/tests/test_bypass.cc
@@ -28,6 +28,7 @@
 #include "src/wp2/decode.h"
 #include "src/wp2/encode.h"
 #include "src/wp2/format_constants.h"
+#include "gtest/gtest.h"
 
 namespace WP2 {
 namespace {
@@ -81,7 +82,8 @@
 
     ASSERT_WP2_OK(GlobalAnalysis(enc_tile.rgb_input, enc_tile.yuv_input, transf,
                                  config, &gparams));
-    ASSERT_WP2_OK(BypassTileEnc(gparams, &enc_tile, &writer));
+    ASSERT_WP2_OK(BypassTileEnc(gparams, WP2IsPremultiplied(src_format),
+                                &enc_tile, &writer));
   }
 
 #if !defined(WP2_ENC_DEC_MATCH)
@@ -95,6 +97,7 @@
   {  // Decode
     BitstreamFeatures features;
     features.rgb_bit_depth = WP2Formatbpc(src_format);
+    features.is_premultiplied = WP2IsPremultiplied(src_format);
     TilesLayout tiles;
     tiles.num_tiles_x = tiles.num_tiles_y = 1;
     tiles.tile_width = tiles.tile_height = width;
diff --git a/tests/test_conversion.cc b/tests/test_conversion.cc
index 8aa55e3..6300a75 100644
--- a/tests/test_conversion.cc
+++ b/tests/test_conversion.cc
@@ -14,11 +14,14 @@
 
 #include <algorithm>
 #include <cstdint>
+#include <numeric>
 #include <utility>
+#include <vector>
 
 #include "include/helpers.h"
 #include "src/dsp/dsp.h"
 #include "src/wp2/base.h"
+#include "gtest/gtest.h"
 
 namespace WP2 {
 namespace {
@@ -73,5 +76,69 @@
   }
 }
 
+TEST(RgbConversionTest, From32bTo64b) {
+  WP2ArgbConverterInit();
+
+  ArgbBuffer src(WP2_ARGB_32);
+  ASSERT_WP2_OK(src.Resize(256, 256));
+  for (uint32_t y = 0; y < src.height(); ++y) {
+    for (uint32_t x = 0; x < src.width(); ++x) {
+      src.GetRow8(y)[x * 4] = x;
+      src.GetRow8(y)[x * 4 + 1] = y;
+      src.GetRow8(y)[x * 4 + 2] = y;
+      src.GetRow8(y)[x * 4 + 3] = y;
+    }
+  }
+
+  ArgbBuffer dst(WP2_ARGB_64);
+  ASSERT_WP2_OK(dst.ConvertFrom(src));
+  ArgbBuffer roundtrip(WP2_ARGB_32);
+  ASSERT_WP2_OK(roundtrip.ConvertFrom(dst));
+  ASSERT_TRUE(testutil::Compare(src, roundtrip, "all_alpha_rgb_combinations"));
+}
+
+TEST(RgbConversionTest, From64bTo32b) {
+  WP2ArgbConverterInit();
+
+  // Try some alpha values but not all, otherwise it takes too long.
+  std::vector<uint16_t> alpha_values(550);
+  std::iota(alpha_values.begin(), alpha_values.end(), 0);
+  alpha_values.push_back(1000);
+  alpha_values.push_back(65533);
+  alpha_values.push_back(65534);
+  alpha_values.push_back(65535);
+
+  for (uint16_t alpha : alpha_values) {
+    ArgbBuffer src(WP2_ARGB_64);
+    ASSERT_WP2_OK(src.Resize(256, 256));
+    // Test all values of RGB.
+    for (uint32_t y = 0; y < src.height(); ++y) {
+      for (uint32_t x = 0; x < src.width(); ++x) {
+        src.GetRow16(y)[x * 4] = alpha;
+        src.GetRow16(y)[x * 4 + 1] = x;
+        src.GetRow16(y)[x * 4 + 2] = y;
+        src.GetRow16(y)[x * 4 + 3] = y * 256 + x;
+      }
+    }
+
+    ArgbBuffer dst(WP2_ARGB_32);
+    ASSERT_WP2_OK(dst.ConvertFrom(src));
+    ArgbBuffer roundtrip(WP2_ARGB_64);
+    ASSERT_WP2_OK(roundtrip.ConvertFrom(dst));
+    ASSERT_TRUE(testutil::Compare(src, roundtrip, "all_alpha_rgb_combinations",
+                                  /*expected_distortion=*/40.f));
+
+    for (uint32_t y = 0; y < src.height(); ++y) {
+      for (uint32_t x = 0; x < src.width(); ++x) {
+        for (uint32_t c = 0; c < 4; ++c) {
+          ASSERT_NEAR(src.GetRow16(y)[x * 4 + c],
+                      roundtrip.GetRow16(y)[x * 4 + c], 128)
+              << " (channel " << c << " at " << x << ", " << y << ")";
+        }
+      }
+    }
+  }
+}
+
 }  // namespace
 }  // namespace WP2
diff --git a/tests/test_formats.cc b/tests/test_formats.cc
index 9239ce1..0706804 100644
--- a/tests/test_formats.cc
+++ b/tests/test_formats.cc
@@ -17,10 +17,10 @@
 // premultiplied before encoding and the 'premul <-> unpremul <-> premul'
 // operation is stable.
 
+#include <algorithm>
 #include <cstddef>
 #include <cstdint>
 #include <cstdio>
-#include <ostream>
 #include <string>
 #include <tuple>
 #include <vector>
@@ -35,6 +35,7 @@
 #include "src/wp2/decode.h"
 #include "src/wp2/encode.h"
 #include "src/wp2/format_constants.h"
+#include "gtest/gtest.h"
 
 namespace WP2 {
 namespace {
@@ -312,7 +313,7 @@
     src_is_premul_ = std::get<1>(GetParam());
     tmp_is_premul_ = WP2IsPremultiplied(tmp_format);
     tmp_has_alpha_ = WP2FormatHasAlpha(tmp_format);
-    tmp_is_38b_ = (WP2Formatbpc(tmp_format) > 8);
+    tmp_is_38b_ = tmp_format == WP2_Argb_38;
     convert_to_tmp_ = src_is_premul_ ? WP2ArgbConvertTo[tmp_format]
                                      : WP2ARGBConvertTo[tmp_format];
     convert_from_tmp_ = src_is_premul_ ? WP2ArgbConvertFrom[tmp_format]
@@ -323,14 +324,20 @@
   // Returns the expected output of 'v' given alpha 'a' and input/temp formats.
   uint32_t GetExpectedValue(uint32_t a, uint32_t v) const {
     if (src_is_premul_) {
+      if (tmp_has_alpha_) return v;
+      if (tmp_is_38b_) return std::min(Unmult(a, v), 1023u);
       return (tmp_has_alpha_ ? v : Unmult(a, v));
     }
     if (tmp_is_premul_) {
       // Temp is premultiplied but not src so there is some loss.
       if (tmp_is_38b_) {
-        return RightShiftRound(Unmult(a, DivBy255(a * LeftShift(v, 2))), 2);
+        const uint32_t v_38b = (v * 1023 + 127) / 255;
+        const uint32_t v_premul = (a * v_38b + 127) / 255;
+        const uint32_t v_unmul = std::min(Unmult(a, v_premul), 1023u);
+        const uint32_t v_32b = (v_unmul * 255 + 511) / 1023;
+        return v_32b;
       }
-      return Unmult(a, DivBy255(a * v));
+      return Unmult(a, (a * v + 127) / 255);
     }
     return v;
   }
@@ -375,8 +382,7 @@
     ExhaustiveTestInstantiation, ExhaustiveTest,
     testing::Combine(testing::ValuesIn(testutil::kWP2CpuInfoStructs),
                      testing::Values(false, true),  // src_is_premul_
-                     // TODO(yguyon): Include WP2_Argb_38 when ready
-                     testing::Range(WP2_Argb_32, WP2_Argb_38)  // tmp_format
+                     testing::Range(WP2_Argb_32, WP2_FORMAT_NUM)  // tmp_format
                      ));
 
 //------------------------------------------------------------------------------
@@ -402,9 +408,6 @@
 };
 
 typedef std::tuple<FmtTestCase, testutil::WP2CPUInfoStruct> SpeedTestParam;
-static void PrintTo(const SpeedTestParam& p, std::ostream* os) {
-  *os << "{" << std::get<0>(p).name << ", " << std::get<1>(p).name << "}";
-}
 
 class SpeedTest : public testing::TestWithParam<SpeedTestParam> {
   void SetUp() override {
diff --git a/tests/test_imageio_16b.cc b/tests/test_imageio_16b.cc
new file mode 100644
index 0000000..4ca729d
--- /dev/null
+++ b/tests/test_imageio_16b.cc
@@ -0,0 +1,59 @@
+// Copyright 2024 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+
+#include "imageio/image_dec.h"
+#include "imageio/image_enc.h"
+#include "include/helpers.h"
+#include "src/wp2/base.h"
+#include "gtest/gtest.h"
+
+namespace WP2 {
+namespace {
+
+TEST(ReadWriteRead16bImageTest, Opaque) {
+  for (const std::string prefix : {"gray", "rgb"}) {
+    ArgbBuffer buffer(WP2_ARGB_64);
+    ASSERT_WP2_OK(ReadImage(
+        testutil::GetTestDataPath(prefix + "16.png").c_str(), &buffer));
+    EXPECT_FALSE(buffer.HasTransparency());
+
+    const std::string dst =
+        testutil::GetTempDataPath("test_imageio_16b_" + prefix + "16.png");
+    ASSERT_WP2_OK(SaveImage(buffer, dst.c_str()));
+    ArgbBuffer written(WP2_ARGB_64);
+    ASSERT_WP2_OK(ReadImage(dst.c_str(), &written));
+    EXPECT_TRUE(testutil::Compare(buffer, written, dst.c_str()));
+  }
+}
+
+TEST(ReadWriteRead16bImageTest, Translucent) {
+  for (const std::string prefix : {"grayalpha", "rgba"}) {
+    ArgbBuffer buffer(WP2_ARGB_64);
+    ASSERT_WP2_OK(ReadImage(
+        testutil::GetTestDataPath(prefix + "16.png").c_str(), &buffer));
+    EXPECT_TRUE(buffer.HasTransparency());
+
+    const std::string dst =
+        testutil::GetTempDataPath("test_imageio_16b_" + prefix + "16.png");
+    ASSERT_WP2_OK(SaveImage(buffer, dst.c_str()));
+    ArgbBuffer written(WP2_ARGB_64);
+    ASSERT_WP2_OK(ReadImage(dst.c_str(), &written));
+    EXPECT_TRUE(testutil::Compare(buffer, written, dst.c_str()));
+  }
+}
+
+}  // namespace
+}  // namespace WP2
diff --git a/tests/test_imageio_conversion.cc b/tests/test_imageio_conversion.cc
index d7a5320..e49e0ad 100644
--- a/tests/test_imageio_conversion.cc
+++ b/tests/test_imageio_conversion.cc
@@ -29,6 +29,7 @@
 #include "src/utils/plane.h"
 #include "src/utils/utils.h"
 #include "src/wp2/base.h"
+#include "gtest/gtest.h"
 
 namespace WP2 {
 namespace {
@@ -292,7 +293,36 @@
 
   // 10-bit is unsupported.
   EXPECT_EQ(TestChannelOrder("source1.png", WP2_Argb_38),
-            WP2_STATUS_INVALID_COLORSPACE);
+            WP2_STATUS_INVALID_PARAMETER);
+
+  // Reading a 16-bit image in a 8-bit buffer is forbidden.
+  EXPECT_EQ(TestChannelOrder("basi0g16.png", WP2_Argb_38),
+            WP2_STATUS_INVALID_PARAMETER);
+}
+
+//------------------------------------------------------------------------------
+
+TEST(ReadWriteRead16bImageTest, LossDueToLowerBitDepth) {
+  for (const std::string prefix : {"gray", "grayalpha", "rgb", "rgba"}) {
+    const std::string path16 = testutil::GetTestDataPath(prefix + "16.png");
+    ArgbBuffer original16(WP2_ARGB_64);
+    ASSERT_WP2_OK(ReadImage(path16.c_str(), &original16));
+
+    // Introduce some loss.
+    ArgbBuffer converted8(WP2_ARGB_32);
+    ASSERT_WP2_OK(converted8.ConvertFrom(original16));
+    ArgbBuffer converted16(original16.format());
+    ASSERT_WP2_OK(converted16.ConvertFrom(converted8));
+
+    EXPECT_TRUE(testutil::Compare(original16, converted16, path16,
+                                  /*expected_distortion=*/40.f));
+
+    // Compare with 8-bit source.
+    const std::string path8 = testutil::GetTestDataPath(prefix + "8.png");
+    ArgbBuffer original8(WP2_ARGB_32);
+    ASSERT_WP2_OK(ReadImage(path8.c_str(), &original8));
+    EXPECT_TRUE(testutil::Compare(original8, converted8, path8));
+  }
 }
 
 //------------------------------------------------------------------------------
diff --git a/tests/test_slow_lossless_enc.cc b/tests/test_slow_lossless_enc.cc
index ae52e0d..78b6773 100644
--- a/tests/test_slow_lossless_enc.cc
+++ b/tests/test_slow_lossless_enc.cc
@@ -22,6 +22,7 @@
 #include "src/wp2/base.h"
 #include "src/wp2/decode.h"
 #include "src/wp2/encode.h"
+#include "gtest/gtest.h"
 
 namespace WP2 {
 namespace {
diff --git a/tests/testdata/gray16.png b/tests/testdata/gray16.png
new file mode 100644
index 0000000..9cf9751
--- /dev/null
+++ b/tests/testdata/gray16.png
Binary files differ
diff --git a/tests/testdata/gray8.png b/tests/testdata/gray8.png
new file mode 100644
index 0000000..62f8f44
--- /dev/null
+++ b/tests/testdata/gray8.png
Binary files differ
diff --git a/tests/testdata/grayalpha16.png b/tests/testdata/grayalpha16.png
new file mode 100644
index 0000000..db73055
--- /dev/null
+++ b/tests/testdata/grayalpha16.png
Binary files differ
diff --git a/tests/testdata/grayalpha8.png b/tests/testdata/grayalpha8.png
new file mode 100644
index 0000000..126085c
--- /dev/null
+++ b/tests/testdata/grayalpha8.png
Binary files differ
diff --git a/tests/testdata/rgb16.png b/tests/testdata/rgb16.png
new file mode 100644
index 0000000..9a360d7
--- /dev/null
+++ b/tests/testdata/rgb16.png
Binary files differ
diff --git a/tests/testdata/rgb8.png b/tests/testdata/rgb8.png
new file mode 100644
index 0000000..34c226c
--- /dev/null
+++ b/tests/testdata/rgb8.png
Binary files differ
diff --git a/tests/testdata/rgba16.png b/tests/testdata/rgba16.png
new file mode 100644
index 0000000..20d4e19
--- /dev/null
+++ b/tests/testdata/rgba16.png
Binary files differ
diff --git a/tests/testdata/rgba8.png b/tests/testdata/rgba8.png
new file mode 100644
index 0000000..2216418
--- /dev/null
+++ b/tests/testdata/rgba8.png
Binary files differ