// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// -----------------------------------------------------------------------------
//
// CodedBlock, for encoding
//
// Author: Skal (pascal.massimino@gmail.com)
#include "src/enc/block_enc.h"
#include <algorithm>
#include <cassert>
#include "src/common/lossy/block.h"
#include "src/common/lossy/predictor.h"
#include "src/common/lossy/transforms.h"
#include "src/common/symbols.h"
#include "src/enc/trellis.h"
#include "src/enc/wp2_enc_i.h"
#include "src/utils/front_mgr.h"
#include "src/utils/utils.h"
#include "src/wp2/format_constants.h"
namespace WP2 {
//------------------------------------------------------------------------------
// Counters
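// Counters::Init() allocates one SymbolCounter per group of syntax elements
// (predictor modes, transforms, segment ids, residuals), all reading from the
// same SymbolRecorder, so that bit costs can later be estimated from the
// statistics recorded so far.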
WP2Status Counters::Init(uint32_t effort, bool use_aom_residuals,
const SymbolRecorder& recorder) {
effort_ = effort;
use_aom_residuals_ = use_aom_residuals;
recorder_ = &recorder;
predictor_.reset(new (WP2Allocable::nothrow) SymbolCounter(recorder_));
WP2_CHECK_ALLOC_OK(predictor_ != nullptr);
WP2_CHECK_STATUS(
predictor_->Allocate({kSymbolModeY, kSymbolModeUV, kSymbolModeA}));
transform_.reset(new (WP2Allocable::nothrow) SymbolCounter(recorder_));
WP2_CHECK_ALLOC_OK(transform_ != nullptr);
WP2_CHECK_STATUS(transform_->Allocate(
{kSymbolHasCoeffs, kSymbolTransform, kSymbolSplitTransform}));
segment_id_.reset(new (WP2Allocable::nothrow) SymbolCounter(recorder_));
WP2_CHECK_ALLOC_OK(segment_id_ != nullptr);
WP2_CHECK_STATUS(segment_id_->Allocate({kSymbolSegmentId}));
if (effort >= kSlowCounterEffortThreshold) {
// Using the UpdatingSymbolCounter slows down encoding by ~9%
residuals_slow_.reset(new (WP2Allocable::nothrow)
UpdatingSymbolCounter(recorder_));
WP2_CHECK_ALLOC_OK(residuals_slow_ != nullptr);
residuals_ = residuals_slow_.get();
} else {
residuals_fast_.reset(new (WP2Allocable::nothrow) SymbolCounter(recorder_));
WP2_CHECK_ALLOC_OK(residuals_fast_ != nullptr);
residuals_ = residuals_fast_.get();
}
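// Pre-allocate the residual symbols; the needed set depends on whether
// AOM-style residual coding is used.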
if (use_aom_residuals) {
WP2_CHECK_STATUS(residuals()->Allocate(
{kSymbolTransform, kAOMEOBPT4, kAOMEOBPT8, kAOMEOBPT16, kAOMEOBPT32,
kAOMEOBPT64, kAOMEOBPT128, kAOMEOBPT256, kAOMEOBPT512, kAOMEOBPT1024,
kAOMEOBExtra, kAOMCoeffBaseEOB, kAOMCoeffBase, kAOMCoeffBaseRange}));
} else {
WP2_CHECK_STATUS(residuals()->Allocate(
{kSymbolDC, kSymbolResidualUseBounds, kSymbolResidualBound1IsX,
kSymbolResidualUseBound2, kSymbolResidualIsZero, kSymbolResidualIsOne,
kSymbolResidualIsTwo, kSymbolResidualEndOfBlock,
kSymbolResidualHasOnlyOnesLeft}));
}
return WP2_STATUS_OK;
}
WP2Status Counters::CopyFrom(const Counters& other,
const SymbolRecorder& recorder) {
if (&recorder != recorder_) {
WP2_CHECK_STATUS(Init(other.effort_, other.use_aom_residuals_, recorder));
} else {
// Init() was already called.
assert(transform_ != nullptr && residuals_ != nullptr &&
predictor_ != nullptr);
}
if (residuals_slow_ != nullptr) {
WP2_CHECK_STATUS(residuals_slow_->CopyFrom(*other.residuals_slow_));
}
return WP2_STATUS_OK;
}
//------------------------------------------------------------------------------
// CodedBlock (encoder)
void CodedBlock::SetSrcInput(const YUVPlane& in) {
WP2_ASSERT_STATUS(in_.SetView(in, AsRect()));
}
//------------------------------------------------------------------------------
void CodedBlock::ExtractFrom(const YUVPlane& yuv, Channel channel) const {
Plane16 src, dst;
WP2_ASSERT_STATUS(src.SetView(yuv.GetChannel(channel), AsRect()));
WP2_ASSERT_STATUS(dst.SetView(out_.GetChannel(channel), AsRect()));
WP2_ASSERT_STATUS(dst.Copy(src, /*resize_if_needed=*/false));
}
uint64_t CodedBlock::GetDisto(Channel channel) const {
const Plane16& p_in = in_.GetChannel(channel);
const Plane16& p_out = out_.GetChannel(channel);
// TODO(skal): experiment with calling WP2SumSquaredErrorHalfBlock for the
// U/V channels
return WP2SumSquaredErrorBlock(p_in.Row(0), p_in.Step(), p_out.Row(0),
p_out.Step(), w_pix(), h_pix());
}
void CodedBlock::GetResiduals(Channel channel, uint32_t tf_i,
const Plane16& prediction,
BlockCoeffs16* const res) const {
const Plane16& in = in_.GetChannel(channel);
const Rectangle tf_rect =
blk().LocalTfRect(GetCodingParams(channel).split_tf, tf_i);
SubtractBlockFunc sub_block = WP2::SubtractBlock[TrfLog2[tf_rect.width] - 2];
sub_block(&in.At(tf_rect.x, tf_rect.y), in.Step(),
&prediction.At(tf_rect.x, tf_rect.y), prediction.Step(),
(*res)[tf_i], tf_rect.width, tf_rect.height);
#if defined(WP2_BITTRACE)
if (original_res_ != nullptr) {
memcpy((*original_res_)[channel][tf_i], &(*res)[tf_i][0],
tf_rect.width * tf_rect.height * sizeof((*res)[tf_i][0]));
}
#endif
}
void CodedBlock::TransformAndReconstruct(
const EncoderConfig& config, const BlockContext& context,
const Segment& segment, Channel channel, uint32_t tf_i,
bool reduced_transform, const Plane16& prediction,
const BlockCoeffs16* const res, BlockCoeffs16* const tmp,
Counters* const counters) {
assert(tf_i < GetNumTransforms(channel));
const CodingParams* const params = GetCodingParams(channel);
const QuantMtx& quant = segment.GetQuant(channel);
BlockCoeffs16& dequantized = *tmp;
const BlockSize split_size =
GetSplitSize(dim(), GetCodingParams(channel)->split_tf);
const uint32_t split_w = BlockWidthPix(split_size);
const uint32_t split_h = BlockHeightPix(split_size);
int32_t tf_res[kMaxBlockSizePix2];
WP2Transform2D((*res)[tf_i], params->tf_x(), params->tf_y(), split_w, split_h,
tf_res, reduced_transform);
if (channel == kUChannel || channel == kVChannel) {
assert(params->tf == kDctDct && !params->split_tf);
// Note that DC is scaled by NumPix(dim) by WP2Transform2D. We don't
// normalize it when propagating the error, even though the same value
// will have a bigger impact on a smaller block than on a large one.
// This is intentional: since we have fewer pixels in a smaller block,
// we shift them more. But it would also be possible to normalize by a
// factor of NumPix(dim) (multiply dc_error_ by this factor and divide
// dc_error_next by the same factor). This would mean we apply the same
// color shift regardless of the block size.
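// A rough sketch of that normalized alternative (illustration only, not what
// is done below; NumPix() is used here as in the comment above):
//   tf_res[0] += dc_error_[channel] * (int32_t)NumPix(dim());
//   dc_error_next_[channel] =
//       quant.DCError(tf_res[0], tdim(channel)) / (int32_t)NumPix(dim());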
tf_res[0] += dc_error_[channel];
dc_error_next_[channel] = quant.DCError(tf_res[0], tdim(channel));
}
// Apply some coefficient optimization: rate-distortion optimization based on
// AV1 coefficient coding, or simple dropout.
const bool use_coeff_optim =
(params->tf != kIdentityIdentity && config.effort >= 7);
const bool use_av1_coeff_optim = (use_coeff_optim && context.use_aom() &&
channel == kYChannel && config.effort >= 8);
// Av1CoeffOptimization works better if quantization uses simple rounding
// instead of biasing towards 0.
const bool use_bias = !use_av1_coeff_optim;
quant.Quantize(tf_res, tdim(channel), IsFirstCoeffDC(channel),
coeffs_[channel][tf_i], &num_coeffs_[channel][tf_i],
dequantized[tf_i], use_bias);
if (use_coeff_optim) {
if (use_av1_coeff_optim) {
Av1CoeffOptimization(
channel, params->tf, tdim(channel), quant, IsFirstCoeffDC(channel),
tf_res, coeffs_[channel][tf_i], &num_coeffs_[channel][tf_i],
dequantized[tf_i], counters->residuals(), /*rate_cost=*/nullptr);
} else {
DropoutCoeffs(tdim(channel), segment.quality_factor_,
coeffs_[channel][tf_i], &num_coeffs_[channel][tf_i],
dequantized[tf_i]);
}
}
if (channel == kYChannel && mtx_set_ != nullptr) {
mtx_set_->DecideUseRndMtx(this); // TODO(skal): later: U/V channel?
}
Reconstruct(channel, tf_i, reduced_transform, &dequantized, prediction);
}
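// Reconstruct() inverse-transforms the dequantized coefficients in 'res' and
// adds them to the prediction (recomputed with PredictBlock() if 'prediction'
// is empty), clamped to the valid sample range, into the 'out_' pixels.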
void CodedBlock::Reconstruct(Channel channel, uint32_t tf_i,
bool reduced_transform, BlockCoeffs16* const res,
const Plane16& prediction) const {
assert(tf_i < GetNumTransforms(channel));
const CodingParams& params = GetCodingParams(channel);
const Rectangle tf_rect = blk().LocalTfRect(params.split_tf, tf_i);
WP2InvTransform2D((*res)[tf_i], params.tf_x(), params.tf_y(), tf_rect.width,
tf_rect.height, reduced_transform);
Plane16* const out = &out_.GetChannel(channel);
Plane16 pred_view;
if (prediction.IsEmpty()) {
WP2_ASSERT_STATUS(pred_view.SetView(*out));
PredictBlock(channel, tf_i, pred_view.Row(0), pred_view.Step());
} else {
WP2_ASSERT_STATUS(pred_view.SetView(prediction));
}
AddBlockFunc add_block = WP2::AddBlock[TrfLog2[tf_rect.width] - 2];
const int32_t min = (channel == kAChannel) ? 0 : yuv_min_;
const int32_t max = (channel == kAChannel) ? kAlphaMax : yuv_max_;
add_block(&pred_view.At(tf_rect.x, tf_rect.y), pred_view.Step(), (*res)[tf_i],
tf_rect.width, min, max, &out->At(tf_rect.x, tf_rect.y),
out->Step(), tf_rect.height);
if (params.split_tf && tf_i < 3) {
context_cache_->Reset(/*only_small_right_or_bot_contexts=*/true);
}
}
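// QuantizeAll() and QuantizeAllButFirst() run Quantize() on every transform of
// the block (respectively every one but the first) for the given channel.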
void CodedBlock::QuantizeAll(const EncoderConfig& config,
const BlockContext& context,
const Segment& segment, Channel channel,
bool reduced_transform, Counters* const counters) {
for (uint32_t tf_i = 0; tf_i < GetNumTransforms(channel); ++tf_i) {
Quantize(config, context, segment, channel, tf_i, reduced_transform,
counters);
}
}
void CodedBlock::QuantizeAllButFirst(const EncoderConfig& config,
const BlockContext& context,
const Segment& segment, Channel channel,
bool reduced_transform,
Counters* const counters) {
for (uint32_t tf_i = 1; tf_i < GetNumTransforms(channel); ++tf_i) {
Quantize(config, context, segment, channel, tf_i, reduced_transform,
counters);
}
}
void CodedBlock::Quantize(const EncoderConfig& config,
const BlockContext& context, const Segment& segment,
Channel channel, uint32_t tf_i,
bool reduced_transform, Counters* const counters) {
BlockCoeffs16 res, tmp;
Plane16* const out = &out_.GetChannel(channel);
PredictBlock(channel, tf_i, out->Row(0), out->Step());
GetResiduals(channel, tf_i, /*prediction=*/*out, &res);
TransformAndReconstruct(config, context, segment, channel, tf_i,
reduced_transform,
/*prediction=*/*out, &res, &tmp, counters);
}
//------------------------------------------------------------------------------
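// Estimates the cost in bits of the quantized residuals of 'channel': either
// with the AOM cost model, or per transform with an accurate or a faster
// approximate estimate.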
float CodedBlock::ResidualRate(const BlockContext& context, Channel channel,
uint32_t num_channels,
SymbolCounter* const counter) const {
if (context.use_aom()) {
return ResidualWriter::GetRateAOM(*this, channel, counter);
}
// Whether to use the more accurate but much more expensive rate estimation.
// Even this one is not exact because the cost depends on global
// distributions which are not known at this point.
static constexpr bool kUseExpensiveRate = false;
float rate = 0.f;
for (uint32_t tf_i = 0; tf_i < GetNumTransforms(channel); ++tf_i) {
if (kUseExpensiveRate) {
rate += ResidualWriter::GetRate(
channel, num_channels, tdim(channel), coeffs_[channel][tf_i],
num_coeffs_[channel][tf_i], IsFirstCoeffDC(channel), counter);
} else {
rate += ResidualWriter::GetPseudoRate(
channel, num_channels, tdim(channel), coeffs_[channel][tf_i],
num_coeffs_[channel][tf_i], IsFirstCoeffDC(channel), counter);
}
}
return rate;
}
// Quantizes 'coeffs', then dequantizes and inverse-transforms them, and
// computes the distortion against the spatial residuals 'res'.
static uint32_t RoundTripDisto(CodedBlock* const cb, const QuantMtx& quant,
Channel channel, TrfSize tdim,
const int16_t* const res,
const int32_t* const coeffs, bool reduced,
int16_t* const quantized_coeffs,
uint32_t* const num_coeffs) {
int16_t dequantized_coeffs[kMaxBlockSizePix2];
quant.Quantize(coeffs, tdim, cb->IsFirstCoeffDC(channel), quantized_coeffs,
num_coeffs, dequantized_coeffs);
const CodedBlock::CodingParams& params = *cb->GetCodingParams(channel);
WP2InvTransform2D(dequantized_coeffs, params.tf_x(), params.tf_y(),
cb->w_pix(), cb->h_pix(), reduced);
return WP2SumSquaredError16s(res, dequantized_coeffs, *num_coeffs);
}
// Computes the rate-distortion score for the U and V channels.
// 'res_u' and 'res_v' should be in frequency space. 'orig_u' and 'orig_v' are
// the original spatial residuals to compare against for distortion.
static WP2Status UVScore(const BlockContext& context,
const EncoderConfig& config, uint32_t tile_pos_x,
uint32_t tile_pos_y, bool has_alpha,
const QuantMtx& quant_u, const QuantMtx& quant_v,
const int16_t orig_u[kMaxBlockSizePix2],
const int16_t orig_v[kMaxBlockSizePix2],
const int32_t res_u[kMaxBlockSizePix2],
const int32_t res_v[kMaxBlockSizePix2], TrfSize dim,
bool reduced, SymbolCounter* const counter,
CodedBlock* const cb, float* score) {
const float lambda_u = quant_u.lambda * 2.f; // Another magic constant.
const float lambda_v = quant_v.lambda * 2.f;
uint32_t tmp_num_coeffs_u, tmp_num_coeffs_v;
int16_t tmp_coeffs_u[kMaxBlockSizePix2];
int16_t tmp_coeffs_v[kMaxBlockSizePix2];
const uint32_t disto =
RoundTripDisto(cb, quant_u, kUChannel, dim, orig_u, res_u, reduced,
tmp_coeffs_u, &tmp_num_coeffs_u) +
RoundTripDisto(cb, quant_v, kVChannel, dim, orig_v, res_v, reduced,
tmp_coeffs_v, &tmp_num_coeffs_v);
// early out if there's ~only DC
if (!reduced && tmp_num_coeffs_u <= 1 && tmp_num_coeffs_v <= 1) {
WP2_CHECK_REDUCED_STATUS(
cb->Store420Scores(config, tile_pos_x, tile_pos_y, lambda_u, lambda_v,
reduced, disto, 0, 0));
*score = 0;
return WP2_STATUS_OK;
}
const uint32_t num_channels = (has_alpha ? 4 : 3);
const float rate_u =
cb->ResidualRate(context, kUChannel, num_channels, counter);
const float rate_v =
cb->ResidualRate(context, kVChannel, num_channels, counter);
*score = disto + lambda_u * rate_u + lambda_v * rate_v;
WP2_CHECK_REDUCED_STATUS(cb->Store420Scores(config, tile_pos_x, tile_pos_y,
lambda_u, lambda_v, reduced,
disto, rate_u, rate_v));
return WP2_STATUS_OK;
}
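// Compares the rate-distortion score of coding the chroma residuals at full
// resolution (4:4:4) against half resolution (4:2:0) and stores the winning
// choice in 'is420_'.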
WP2Status CodedBlock::DecideChromaSubsampling(
const EncoderConfig& config, const BlockContext& context,
uint32_t tile_pos_x, uint32_t tile_pos_y, bool has_alpha,
const QuantMtx& quant_u, const QuantMtx& quant_v,
Counters* const counters) {
assert(!GetCodingParams(kUChannel)->split_tf);
assert(!GetCodingParams(kVChannel)->split_tf);
const BlockSize dim = blk_.dim();
const TrfSize full_dim = kFullDim[dim];
const TrfSize half_dim = kHalfDim[dim];
const uint32_t w_pix = blk_.w_pix();
const uint32_t h_pix = blk_.h_pix();
SubtractBlockFunc sub_block = WP2::SubtractBlock[TrfLog2[w_pix] - 2];
const CodingParams* const u_params = GetCodingParams(kUChannel);
const CodingParams* const v_params = GetCodingParams(kVChannel);
int16_t res_u[kMaxBlockSizePix2], res_v[kMaxBlockSizePix2];
{
int16_t prediction[kMaxBlockSizePix2];
u_params->pred->Predict(*this, kUChannel, /*split_tf=*/false, /*tf_i=*/0,
prediction, w_pix);
sub_block(in_.U.Row(0), in_.U.Step(), prediction, /*preds_step=*/w_pix,
res_u, /*dst_step=*/w_pix, h_pix);
v_params->pred->Predict(*this, kVChannel, /*split_tf=*/false, /*tf_i=*/0,
prediction, w_pix);
sub_block(in_.V.Row(0), in_.V.Step(), prediction, /*preds_step=*/w_pix,
res_v, /*dst_step=*/w_pix, h_pix);
}
int16_t orig_u[kMaxBlockSizePix2], orig_v[kMaxBlockSizePix2];
std::copy(res_u, res_u + w_pix * h_pix, orig_u);
std::copy(res_v, res_v + w_pix * h_pix, orig_v);
int32_t tmp_u[kMaxBlockSizePix2], tmp_v[kMaxBlockSizePix2];
// transform at full resolution
WP2Transform2D(res_u, u_params->tf_x(), u_params->tf_y(), w_pix, h_pix,
tmp_u);
WP2Transform2D(res_v, v_params->tf_x(), v_params->tf_y(), w_pix, h_pix,
tmp_v);
float score444;
WP2_CHECK_STATUS(UVScore(context, config, tile_pos_x, tile_pos_y, has_alpha,
quant_u, quant_v, orig_u, orig_v, tmp_u, tmp_v,
full_dim, /*reduced=*/false, counters->residuals(),
this, &score444));
if (score444 == 0) {
is420_ = false;
WP2_CHECK_REDUCED_STATUS(Store420Decision(config, tile_pos_x, tile_pos_y,
Debug420Decision::k444EarlyExit));
return WP2_STATUS_OK;
}
// inspect half-resolution now
WP2ReduceCoeffs(tmp_u, w_pix, h_pix, tmp_u);
WP2ReduceCoeffs(tmp_v, w_pix, h_pix, tmp_v);
float score420;
WP2_CHECK_STATUS(UVScore(context, config, tile_pos_x, tile_pos_y, has_alpha,
quant_u, quant_v, orig_u, orig_v, tmp_u, tmp_v,
half_dim, /*reduced=*/true, counters->residuals(),
this, &score420));
is420_ = (score420 < score444);
WP2_CHECK_REDUCED_STATUS(Store420Decision(
config, tile_pos_x, tile_pos_y,
is420_ ? Debug420Decision::k420 : Debug420Decision::k444));
return WP2_STATUS_OK;
}
//------------------------------------------------------------------------------
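// The *Rate() methods below estimate the signaling cost of a syntax element by
// simulating its write with an ANSEncCounter and returning the accumulated
// cost, without emitting anything to the bitstream.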
float CodedBlock::PredictorRate(Channel channel,
SymbolCounter* const counter) const {
counter->Clear();
ANSEncCounter enc;
SyntaxWriter::WritePredictors(*this, channel, counter, &enc);
return enc.GetCost();
}
float CodedBlock::TransformRate(Channel channel,
SymbolCounter* const counter) const {
counter->Clear();
ANSEncCounter enc;
SyntaxWriter::WriteSplitTransform(*this, channel, counter, &enc);
SyntaxWriter::WriteHasCoeffs(*this, channel, counter, &enc);
SyntaxWriter::WriteTransform(*this, channel, counter, &enc);
return enc.GetCost();
}
float CodedBlock::SegmentIdRate(const BlockContext& context,
SymbolCounter* const counter) const {
counter->Clear();
ANSEncCounter enc;
context.segment_id_predictor().WriteId(*this, counter, &enc);
return enc.GetCost();
}
//------------------------------------------------------------------------------
namespace {
// Returns the index of 'channel' in one of the following sets:
// {luma Y}, {chroma U, chroma V} or {alpha}
uint32_t GetChannelIndex(Channel channel) {
return (channel == kVChannel) ? 1 : 0;
}
} // namespace
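// The score of a channel is its distortion plus lambda times the sum of the
// rate terms that were actually estimated ('is_defined').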
float BlockRates::GetScore() const {
return distortion +
lambda * ((predictor_rate.is_defined ? predictor_rate.rate : 0.f) +
(transform_rate.is_defined ? transform_rate.rate : 0.f) +
(segment_id_rate.is_defined ? segment_id_rate.rate : 0.f) +
(residuals_rate.is_defined ? residuals_rate.rate : 0.f));
}
bool BlockRates::HasSameScoreFormulaAs(const BlockRates& other) const {
return predictor_rate.is_defined == other.predictor_rate.is_defined &&
transform_rate.is_defined == other.transform_rate.is_defined &&
segment_id_rate.is_defined == other.segment_id_rate.is_defined &&
residuals_rate.is_defined == other.residuals_rate.is_defined &&
lambda == other.lambda;
}
float BlockScore::GetScore() const {
return results[0].GetScore() + results[1].GetScore();
}
float BlockScore::GetDistortion() const {
return results[0].distortion + results[1].distortion;
}
bool BlockScore::IsBetterThan(const BlockScore& other) const {
// Make sure the scores are comparable.
assert(results[0].HasSameScoreFormulaAs(other.results[0]));
assert(results[1].HasSameScoreFormulaAs(other.results[1]));
return (GetScore() < other.GetScore());
}
//------------------------------------------------------------------------------
WP2Status BlockScorer::Init(const EncoderConfig& config,
const GlobalParams& gparams, Rectangle tile_rect) {
config_ = &config;
gparams_ = &gparams;
tile_rect_ = tile_rect;
WP2_CHECK_ALLOC_OK(pred_mode_map_.resize(
std::max({kYPredModeNum, kUVPredModeNum, kAPredModeNum})));
return WP2_STATUS_OK;
}
void BlockScorer::Set(const BlockContext& context, CodedBlock* cb,
Counters* counters) {
cb_ = cb;
context_ = &context;
counters_ = counters;
// Use the segment given by AssignSegmentId() to compute the lambdas because
// we may end up comparing several 'segment_ids_' which would lead to
// different score computation formulas otherwise.
for (Channel channel : {kYChannel, kUChannel, kVChannel, kAChannel}) {
const Segment& segment =
gparams_->segments_[(channel == kAChannel) ? 0 : cb_->id_];
lambdas_[channel] = cb_->lambda_mult_ * segment.GetQuant(channel).lambda;
}
cached_predictor_ = nullptr;
last_params_ = CodedBlock::CodingParams();
last_segment_id_ = 0;
best_combination_ = BlockScore();
best_combination_no_angle_ = BlockScore();
best_combinations_angle_.clear();
}
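// ComputeScore() walks every combination of split_tf, predictor, transform and
// segment id currently listed in 'splits_', 'predictors_', 'transforms_' and
// 'segment_ids_' (unless one of them is forced by a debug setting), and scores
// each combination through ComputeScoreForEachChannel().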
WP2Status BlockScorer::ComputeScore() {
CodedBlock::CodingParams& params = *cb_->GetCodingParams(channels_.front());
const bool can_split =
(channels_.front() == kYChannel &&
GetSplitSize(cb_->dim(), /*split=*/true) != cb_->dim());
const CodedBlock::SplitTf forced_split =
can_split ? cb_->GetForcedSplitTf(*config_, tile_rect_)
: CodedBlock::SplitTf::kUnknown;
if (forced_split != CodedBlock::SplitTf::kUnknown) {
// Set by EncoderConfig::info (debug).
params.split_tf = (forced_split == CodedBlock::SplitTf::kForcedSplit);
WP2_CHECK_STATUS(ComputeScoreForEachPredictor(/*forced=*/true));
} else {
for (const bool split_tf : splits_) {
if (split_tf && !can_split) continue;
params.split_tf = split_tf;
WP2_CHECK_STATUS(ComputeScoreForEachPredictor(/*forced=*/false));
}
}
assert(best_combination_.params.pred != nullptr);
return WP2_STATUS_OK;
}
WP2Status BlockScorer::ComputeScoreForEachPredictor(bool forced) {
const Predictors& preds = gparams_->GetPredictors(channels_.front());
CodedBlock::CodingParams& params = *cb_->GetCodingParams(channels_.front());
const Predictor* const forced_pred =
cb_->GetForcedPredictor(*config_, tile_rect_, preds, channels_.front());
if (channels_.front() == kYChannel && cb_->y_context_is_constant_) {
// If the context is constant, all luma predictors will predict the same
// values (plus or minus 1, because of rounding errors), so we just force
// the predictor to 0 and we do not write it to the bitstream. We cannot
// do the same for UV and alpha because the chroma from luma predictor
// does not have this property, but we could send a single bit saying
// whether the predictor is cfl or "other".
params.pred = preds.GetPred(0); // Assumes it is the DC predictor.
WP2_CHECK_STATUS(ComputeScoreForEachTransform(forced));
} else if (forced_pred != nullptr) {
// Predictor is set by EncoderConfig::info (debug).
params.pred = forced_pred;
WP2_CHECK_STATUS(ComputeScoreForEachTransform(/*forced=*/true));
} else {
for (const Predictor* pred : predictors_) {
params.pred = pred;
WP2_CHECK_STATUS(ComputeScoreForEachTransform(forced));
}
}
return WP2_STATUS_OK;
}
WP2Status BlockScorer::ComputeScoreForEachTransform(bool forced) {
CodedBlock::CodingParams& params = *cb_->GetCodingParams(channels_.front());
for (Channel c : channels_) {
if (!params.pred->ComputeParams(cb_, c)) {
// TODO(skal): deal with useless preds
}
}
const TransformPair forced_tf = cb_->GetForcedTransform(*config_, tile_rect_);
if (forced_tf != kUnknownTf) {
// Transform is set by EncoderConfig::info (debug).
params.tf = forced_tf;
WP2_CHECK_STATUS(ComputeScoreForEachSegment(/*forced=*/true));
} else {
for (const TransformPair tf : transforms_) {
params.tf = tf;
WP2_CHECK_STATUS(ComputeScoreForEachSegment(forced));
}
}
return WP2_STATUS_OK;
}
WP2Status BlockScorer::ComputeScoreForEachSegment(bool forced) {
if (segment_ids_.empty()) {
// Just use the 'cb_->id_' value which was set by AssignSegmentId().
WP2_CHECK_STATUS(ComputeScoreForEachChannel(forced));
} else {
for (uint8_t segment_id : segment_ids_) {
cb_->id_ = segment_id;
WP2_CHECK_STATUS(ComputeScoreForEachChannel(forced));
}
}
return WP2_STATUS_OK;
}
WP2Status BlockScorer::ComputeScoreForEachChannel(bool forced) {
BlockScore combination;
combination.params = *cb_->GetCodingParams(channels_.front());
combination.segment_id = cb_->id_;
if (!forced) {
// Each unique combination should be tried only once.
assert(combination.params != last_params_ ||
combination.segment_id != last_segment_id_);
assert(combination.params != best_combination_.params ||
combination.segment_id != best_combination_.segment_id);
}
// Recompute the prediction of the top left sub-block if needed.
// TODO(yguyon): See if the prediction of !split can be reused for split.
const bool recompute_top_left_residuals =
(cached_predictor_ != combination.params.pred ||
cached_split_tf_ != combination.params.split_tf);
const uint32_t num_channels = (gparams_->maybe_use_lossy_alpha_ ? 4 : 3);
const bool reduced_tf = (channels_.front() == kUChannel && cb_->is420_);
const Segment& segment =
gparams_->segments_[(channels_.front() == kAChannel) ? 0 : cb_->id_];
for (Channel c : channels_) {
const uint32_t ci = GetChannelIndex(c);
BlockRates& results = combination.results[ci];
results.lambda = lambdas_[c];
Plane16 prediction;
WP2_ASSERT_STATUS(prediction.SetView(prediction_cache_[ci][0],
kMaxBlockSizePix, kMaxBlockSizePix,
kMaxBlockSizePix));
BlockCoeffs16& residuals_cache = residuals_cache_[ci];
if (recompute_top_left_residuals) {
cb_->PredictBlock(c, /*tf_i=*/0, prediction.Row(0), prediction.Step());
cb_->GetResiduals(c, /*tf_i=*/0, prediction, &residuals_cache);
if (c == channels_.back()) { // Remember what is cached for all channels.
cached_split_tf_ = combination.params.split_tf;
cached_predictor_ = combination.params.pred;
}
}
BlockCoeffs16 residuals, reconstructed; // Temporary buffers.
// At least one of {pred, tf} changed, so always reconstruct the top left
// sub-block...
cb_->TransformAndReconstruct(*config_, *context_, segment, c, /*tf_i=*/0,
reduced_tf, prediction, &residuals_cache,
&reconstructed, counters_);
// ... and predict / reconstruct the other sub-blocks (which depend on the
// results of the top left one), if any.
for (uint32_t tf_i = 1; tf_i < cb_->GetNumTransforms(c); ++tf_i) {
cb_->PredictBlock(c, tf_i, prediction.Row(0), prediction.Step());
cb_->GetResiduals(c, tf_i, prediction, &residuals);
cb_->TransformAndReconstruct(*config_, *context_, segment, c, tf_i,
reduced_tf, prediction, &residuals,
&reconstructed, counters_);
}
if (c == channels_.back()) { // Remember the last combination tried.
last_params_ = combination.params;
last_segment_id_ = combination.segment_id;
}
// Compute distortion.
results.distortion = cb_->GetDisto(c);
// TODO(yguyon): In some cases, this combination could be discarded right
// now because disto[ci] > best_combination_.score
// Estimate the cost of this configuration.
// TODO(yguyon): some rates may be skipped if all combinations lead to the
// same partial cost.
results.predictor_rate = {/*is_defined=*/true,
cb_->PredictorRate(c, counters_->predictor())};
results.transform_rate = {/*is_defined=*/true,
cb_->TransformRate(c, counters_->transform())};
if (c == kYChannel) {
// Segment ID is decided for luma only and signaled once per block.
results.segment_id_rate = {
/*is_defined=*/true,
cb_->SegmentIdRate(*context_, counters_->segment_id())};
} else {
assert(segment_ids_.empty() && !results.segment_id_rate.is_defined);
}
results.residuals_rate = {
/*is_defined=*/true,
cb_->ResidualRate(*context_, c, num_channels, counters_->residuals())};
}
const bool is_best = (best_combination_.params.pred == nullptr ||
combination.IsBetterThan(best_combination_));
if (is_best) {
best_combination_ = combination;
}
if (combination.params.pred->IsAngle(nullptr)) {
WP2_CHECK_ALLOC_OK(best_combinations_angle_.push_back(combination));
} else if (best_combination_no_angle_.params.pred == nullptr ||
combination.IsBetterThan(best_combination_no_angle_)) {
best_combination_no_angle_ = combination;
}
#if !defined(WP2_REDUCE_BINARY_SIZE)
// Store scores for debug.
for (Channel c : channels_) {
cb_->StorePredictionScore(*config_, tile_rect_, c, *combination.params.pred,
combination.params.tf, combination.segment_id,
combination.results[GetChannelIndex(c)], is_best);
}
#endif // WP2_REDUCE_BINARY_SIZE
return WP2_STATUS_OK;
}
const VectorNoCtor<bool>& BlockScorer::RefinePredictors() {
// Set all predictors to "should-not-be-refined".
std::fill(pred_mode_map_.begin(), pred_mode_map_.end(), false);
// Only angle predictors are refined and returned.
if (best_combinations_angle_.empty()) return pred_mode_map_;
// Refine all the main angle predictors whose score is within a fraction of the
// best (lowest) score, so that predictors with similar scores all get refined.
std::sort(best_combinations_angle_.begin(), best_combinations_angle_.end());
const float min_score = best_combinations_angle_.front().GetScore(); // Best.
const float max_score = best_combinations_angle_.back().GetScore(); // Worst.
const float fraction = config_->effort / 9.f; // Max effort includes all.
const float min_allowed_score =
min_score * (1.f - fraction) + max_score * fraction + 1.f;
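// E.g. at effort 6 the fraction is 6/9, so the cutoff equals
// min_score + (6/9) * (max_score - min_score) + 1.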
assert(config_->effort != 9 || max_score <= min_allowed_score);
for (const BlockScore& main_angle : best_combinations_angle_) {
if (!best_combination_no_angle_.IsBetterThan(main_angle) ||
main_angle.GetDistortion() <=
best_combination_no_angle_.GetDistortion() ||
main_angle.GetScore() <= min_allowed_score) {
assert(main_angle.params.pred->mode() < pred_mode_map_.size());
pred_mode_map_[main_angle.params.pred->mode()] = true;
}
}
return pred_mode_map_;
}
const BlockScore& BlockScorer::GetBestCombination() const {
return best_combination_;
}
void BlockScorer::ReconstructBestCombination() {
assert(best_combination_.params.pred != nullptr);
if (best_combination_.params != last_params_ ||
best_combination_.segment_id != last_segment_id_) {
*cb_->GetCodingParams(channels_.front()) = best_combination_.params;
cb_->id_ = best_combination_.segment_id;
const Segment& segment =
gparams_->segments_[(channels_.front() == kAChannel) ? 0 : cb_->id_];
for (Channel c : channels_) {
const bool is420 = ((c == kUChannel || c == kVChannel) && cb_->is420_);
cb_->QuantizeAll(*config_, *context_, segment, c, is420, counters_);
}
}
}
//------------------------------------------------------------------------------
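// Scores the predictors, transforms, split_tf and segment ids listed in 'm' in
// several passes, narrowing each dimension to its best candidate between
// passes, and finally reconstructs the block with the winning combination.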
WP2Status OptimizeModes(const EncoderConfig& config, const Rectangle& tile_rect,
Channel channel, const Predictors& preds,
const BlockModes& m, const BlockContext& context,
CodedBlock* const cb, Counters* const counters,
BlockScorer* const scorer) {
scorer->Set(context, cb, counters);
// Either luma, chroma or alpha.
WP2_CHECK_ALLOC_OK(scorer->channels_.resize((channel == kUChannel) ? 2 : 1));
scorer->channels_.front() = channel;
if (channel == kUChannel) scorer->channels_.back() = kVChannel;
const bool explicit_segment_id =
scorer->GetGlobalParams().explicit_segment_ids_ && !cb->blk().IsSmall();
// First round of combinations.
WP2_CHECK_ALLOC_OK(scorer->splits_.copy_from(m.splits_tried_during_preds));
WP2_CHECK_ALLOC_OK(scorer->predictors_.copy_from(m.main_preds));
WP2_CHECK_ALLOC_OK(scorer->transforms_.copy_from(m.tf_tried_during_preds));
scorer->segment_ids_.clear();
if (explicit_segment_id) {
WP2_CHECK_ALLOC_OK(
scorer->segment_ids_.copy_from(m.segment_ids_tried_during_preds));
}
WP2_CHECK_STATUS(scorer->ComputeScore());
// Find all sub-angle predictors sharing the same mode (~angle) as the
// best main-angle predictors.
scorer->predictors_.clear();
const VectorNoCtor<bool>& pred_mode_map = scorer->RefinePredictors();
for (const Predictor* const sub_pred : m.sub_preds) {
assert(sub_pred->IsAngle(nullptr));
assert(std::find(m.main_preds.begin(), m.main_preds.end(), sub_pred) ==
m.main_preds.end());
if (pred_mode_map[sub_pred->mode()]) {
assert(std::find(scorer->predictors_.begin(), scorer->predictors_.end(),
sub_pred) == scorer->predictors_.end());
WP2_CHECK_ALLOC_OK(scorer->predictors_.push_back(sub_pred));
}
}
if (!scorer->predictors_.empty()) {
WP2_CHECK_STATUS(scorer->ComputeScore());
}
// Keep only the best of each from now on and try the others one by one.
WP2_CHECK_ALLOC_OK(scorer->splits_.resize(1));
scorer->splits_.front() = scorer->GetBestCombination().params.split_tf;
WP2_CHECK_ALLOC_OK(scorer->predictors_.resize(1));
scorer->predictors_.front() = scorer->GetBestCombination().params.pred;
if (scorer->segment_ids_.empty()) {
// Keep AssignSegmentId() value.
assert(cb->id_ == scorer->GetBestCombination().segment_id);
} else {
WP2_CHECK_ALLOC_OK(scorer->segment_ids_.resize(1));
scorer->segment_ids_.front() = scorer->GetBestCombination().segment_id;
}
// Try other transforms.
if (!m.tf_tried_after_preds.empty()) {
WP2_CHECK_ALLOC_OK(scorer->transforms_.copy_from(m.tf_tried_after_preds));
WP2_CHECK_STATUS(scorer->ComputeScore());
// Keep only the best transform from now on.
WP2_CHECK_ALLOC_OK(scorer->transforms_.resize(1));
scorer->transforms_.front() = scorer->GetBestCombination().params.tf;
}
// Try other segments.
scorer->segment_ids_.clear();
for (uint8_t segment_id : m.segment_ids_tried_after_preds) {
// AssignSegmentId() could have returned a segment present in
// 'm.segment_ids_tried_after_preds' so make sure there is no overlap.
if (explicit_segment_id &&
segment_id != scorer->GetBestCombination().segment_id) {
WP2_CHECK_ALLOC_OK(scorer->segment_ids_.push_back(segment_id));
}
}
if (!scorer->segment_ids_.empty()) {
WP2_CHECK_STATUS(scorer->ComputeScore());
// Keep only the best segment from now on.
WP2_CHECK_ALLOC_OK(scorer->segment_ids_.resize(1));
scorer->segment_ids_.front() = scorer->GetBestCombination().segment_id;
}
// Try other split_tf.
if (!m.splits_tried_after_preds.empty()) {
WP2_CHECK_ALLOC_OK(scorer->splits_.copy_from(m.splits_tried_after_preds));
WP2_CHECK_STATUS(scorer->ComputeScore());
}
scorer->ReconstructBestCombination();
return WP2_STATUS_OK;
}
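// Chroma variant of OptimizeModes(): it also applies DC error diffusion,
// decides between 4:2:0 and 4:4:4 coding when the choice is not already
// imposed, and re-quantizes U/V with the final decision if needed.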
WP2Status OptimizeModesChroma(
const EncoderConfig& config, const Rectangle& tile_rect, bool has_alpha,
const FrontMgrBase& mgr, const Predictors& preds,
ChromaSubsampling chroma_subsampling, const BlockModes& modes,
const BlockContext& context, CodedBlock* const cb, Counters* const counters,
DCDiffusionMap* const dc_error_u, DCDiffusionMap* const dc_error_v,
BlockScorer* const scorer) {
const uint32_t diffusion =
DCDiffusionMap::GetDiffusion(config.error_diffusion);
const bool is420_for_sure =
((chroma_subsampling == ChromaSubsampling::kSingleBlock &&
config.uv_mode == EncoderConfig::UVMode420) ||
chroma_subsampling == ChromaSubsampling::k420);
const bool is444_for_sure =
((chroma_subsampling == ChromaSubsampling::kSingleBlock &&
config.uv_mode == EncoderConfig::UVMode444) ||
chroma_subsampling == ChromaSubsampling::k444);
const Segment& segment = scorer->GetGlobalParams().segments_[cb->id_];
cb->is420_ = is420_for_sure; // Must be set to something now even if unsure.
WP2_CHECK_STATUS(OptimizeModes(config, tile_rect, kUChannel, preds, modes,
context, cb, counters, scorer));
if (diffusion > 0) { // Add error diffusion
cb->dc_error_[kUChannel] = dc_error_u->Get(cb->blk(), diffusion);
cb->dc_error_[kVChannel] = dc_error_v->Get(cb->blk(), diffusion);
}
// Choose the final value of 'cb->is420_'.
// TODO(skal): decide based on coded luma too?
if (is420_for_sure) {
cb->is420_ = true;
} else if (is444_for_sure) {
cb->is420_ = false;
} else {
assert(chroma_subsampling == ChromaSubsampling::kSingleBlock ||
chroma_subsampling == ChromaSubsampling::kAdaptive);
WP2_CHECK_STATUS(cb->DecideChromaSubsampling(
config, context, tile_rect.x, tile_rect.y, has_alpha, segment.quant_u_,
segment.quant_v_, counters));
}
if (diffusion > 0 || (!is420_for_sure && !is444_for_sure)) {
cb->QuantizeAll(config, context, segment, kUChannel, cb->is420_, counters);
cb->QuantizeAll(config, context, segment, kVChannel, cb->is420_, counters);
}
if (diffusion > 0) {
dc_error_u->Store(mgr, cb->blk(), cb->dc_error_next_[kUChannel]);
dc_error_v->Store(mgr, cb->blk(), cb->dc_error_next_[kVChannel]);
}
return WP2_STATUS_OK;
}
//------------------------------------------------------------------------------
} // namespace WP2