Merge "remove idct32x32*_add_neon.asm"
diff --git a/.gitignore b/.gitignore
index 901e8c3..4337a2c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -37,6 +37,7 @@
 /examples/twopass_encoder
 /examples/vp8_multi_resolution_encoder
 /examples/vp8cx_set_ref
+/examples/vp9cx_set_ref
 /examples/vp9_lossless_encoder
 /examples/vp9_spatial_scalable_encoder
 /examples/vpx_temporal_scalable_patterns
diff --git a/test/codec_factory.h b/test/codec_factory.h
index 3415284..d5882ed 100644
--- a/test/codec_factory.h
+++ b/test/codec_factory.h
@@ -65,6 +65,12 @@
     : public ::testing::TestWithParam<
           std::tr1::tuple<const libvpx_test::CodecFactory *, T1, T2, T3> > {};
 
+template <class T1, class T2, class T3, class T4>
+class CodecTestWith4Params  // param tuple: codec factory first, then T1..T4
+    : public ::testing::TestWithParam<
+          std::tr1::tuple<const libvpx_test::CodecFactory *, T1, T2, T3, T4> > {
+};
+
 /*
  * VP8 Codec Definitions
  */
diff --git a/test/vp9_ethread_test.cc b/test/vp9_ethread_test.cc
index 0590487..804dc89 100644
--- a/test/vp9_ethread_test.cc
+++ b/test/vp9_ethread_test.cc
@@ -20,10 +20,12 @@
 namespace {
 class VPxEncoderThreadTest
     : public ::libvpx_test::EncoderTest,
-      public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
+      public ::libvpx_test::CodecTestWith4Params<libvpx_test::TestMode, int,
+                                                 int, int> {
  protected:
   VPxEncoderThreadTest()
-      : EncoderTest(GET_PARAM(0)), encoder_initialized_(false), tiles_(2),
+      : EncoderTest(GET_PARAM(0)), encoder_initialized_(false),
+        tiles_(GET_PARAM(3)), threads_(GET_PARAM(4)),
         encoding_mode_(GET_PARAM(1)), set_cpu_used_(GET_PARAM(2)) {
     init_flags_ = VPX_CODEC_USE_PSNR;
     md5_.clear();
@@ -91,6 +93,7 @@
 
   bool encoder_initialized_;
   int tiles_;
+  int threads_;
   ::libvpx_test::TestMode encoding_mode_;
   int set_cpu_used_;
   std::vector<std::string> md5_;
@@ -111,7 +114,7 @@
   md5_.clear();
 
   // Encode using multiple threads.
-  cfg_.g_threads = 4;
+  cfg_.g_threads = threads_;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
   multi_thr_md5 = md5_;
   md5_.clear();
@@ -124,5 +127,7 @@
                           ::testing::Values(::libvpx_test::kTwoPassGood,
                                             ::libvpx_test::kOnePassGood,
                                             ::libvpx_test::kRealTime),
-                          ::testing::Range(1, 9));
+                          ::testing::Range(1, 9),   // cpu_used
+                          ::testing::Range(0, 3),   // tile_columns
+                          ::testing::Range(2, 5));  // threads
 }  // namespace
diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c
index c0de390..e89247a 100644
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -1011,12 +1011,13 @@
       // gf_noboost_onepass_cbr = 1, which forces the gf to use the same
       // rate correction factor as last.
       cpi->gf_noboost_onepass_cbr = (cpi->oxcf.gf_cbr_boost_pct <= 100);
-      cpi->this_frame_target =
-          (cpi->this_frame_target * (100 + cpi->oxcf.gf_cbr_boost_pct)) / 100;
       cpi->baseline_gf_interval = cpi->gf_interval_onepass_cbr;
       // Skip this update if the zero_mvcount is low.
-      if (cpi->zeromv_count > (cpi->common.MBs >> 1))
+      if (cpi->zeromv_count > (cpi->common.MBs >> 1)) {
         cpi->common.refresh_golden_frame = 1;
+        cpi->this_frame_target =
+            (cpi->this_frame_target * (100 + cpi->oxcf.gf_cbr_boost_pct)) / 100;
+      }
       cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
       cpi->current_gf_interval = cpi->frames_till_gf_update_due;
     }
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 22b28de..49aea69 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -915,6 +915,125 @@
   }
 }
 
+// Worker hook: runs write_modes() for tile data->tile_idx into data->dest.
+static int encode_tile_worker(VP9_COMP *cpi, VP9BitstreamWorkerData *data) {
+  vpx_start_encode(&data->bit_writer, data->dest);
+  write_modes(cpi, &data->xd, &cpi->tile_data[data->tile_idx].tile_info,
+              &data->bit_writer, &data->tok, data->tok_end,
+              &data->max_mv_magnitude, data->interp_filter_selected);
+  assert(data->tok == data->tok_end);  // the whole token range was consumed
+  vpx_stop_encode(&data->bit_writer);
+  return 1;  // unconditional nonzero status
+}
+
+// Frees each worker's output buffer and then the worker-data array itself.
+void vp9_bitstream_encode_tiles_buffer_dealloc(VP9_COMP *const cpi) {
+  int w;
+  if (cpi->vp9_bitstream_worker_data == NULL) return;
+  // Entry 0 never gets a buffer of its own, so freeing starts at entry 1.
+  for (w = 1; w < cpi->num_workers; ++w)
+    vpx_free(cpi->vp9_bitstream_worker_data[w].dest);
+  vpx_free(cpi->vp9_bitstream_worker_data);
+  cpi->vp9_bitstream_worker_data = NULL;
+}
+
+// Allocates worker data and per-worker output buffers; returns 1 on failure.
+static int encode_tiles_buffer_alloc(VP9_COMP *const cpi) {
+  int i;
+  const int dest_size = cpi->oxcf.width * cpi->oxcf.height;
+  const size_t worker_data_size =
+      cpi->num_workers * sizeof(*cpi->vp9_bitstream_worker_data);
+  cpi->vp9_bitstream_worker_data = vpx_memalign(16, worker_data_size);
+  if (!cpi->vp9_bitstream_worker_data) return 1;  // must precede the memset
+  memset(cpi->vp9_bitstream_worker_data, 0, worker_data_size);
+  for (i = 1; i < cpi->num_workers; ++i) {
+    cpi->vp9_bitstream_worker_data[i].dest_size = dest_size;
+    cpi->vp9_bitstream_worker_data[i].dest = vpx_malloc(dest_size);
+    if (!cpi->vp9_bitstream_worker_data[i].dest) return 1;
+  }
+  return 0;
+}
+
+// Packs the tile columns using up to num_workers workers per pass. Returns the
+// number of bytes written to data_ptr, or 0 if allocation or a sync fails.
+static size_t encode_tiles_mt(VP9_COMP *cpi, uint8_t *data_ptr) {
+  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+  VP9_COMMON *const cm = &cpi->common;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int num_workers = cpi->num_workers;
+  size_t total_size = 0;
+  int tile_col = 0;
+  // Rebuild the worker buffers when absent or sized for another frame size.
+  if (!cpi->vp9_bitstream_worker_data ||
+      cpi->vp9_bitstream_worker_data[1].dest_size !=
+          (cpi->oxcf.width * cpi->oxcf.height)) {
+    vp9_bitstream_encode_tiles_buffer_dealloc(cpi);
+    if (encode_tiles_buffer_alloc(cpi)) return 0;
+  }
+  while (tile_col < tile_cols) {
+    int i, j;
+    for (i = 0; i < num_workers && tile_col < tile_cols; ++i) {
+      VPxWorker *const worker = &cpi->workers[i];
+      VP9BitstreamWorkerData *const data = &cpi->vp9_bitstream_worker_data[i];
+
+      // Populate the worker data.
+      data->xd = cpi->td.mb.e_mbd;
+      data->tile_idx = tile_col;
+      data->tok = cpi->tile_tok[0][tile_col];
+      data->tok_end = cpi->tile_tok[0][tile_col] + cpi->tok_count[0][tile_col];
+      data->max_mv_magnitude = cpi->max_mv_magnitude;
+      memset(data->interp_filter_selected, 0,
+             sizeof(data->interp_filter_selected[0][0]) * SWITCHABLE);
+
+      // First thread writes straight into the output buffer, leaving 4 bytes
+      // for the tile-size prefix unless this worker has the last tile.
+      if (i == 0) {
+        data->dest =
+            data_ptr + total_size + (tile_col == tile_cols - 1 ? 0 : 4);
+      }
+      worker->data1 = cpi;
+      worker->data2 = data;
+      worker->hook = (VPxWorkerHook)encode_tile_worker;
+      worker->had_error = 0;
+
+      if (i < num_workers - 1) {
+        winterface->launch(worker);
+      } else {
+        winterface->execute(worker);
+      }
+      ++tile_col;
+    }
+    for (j = 0; j < i; ++j) {
+      VPxWorker *const worker = &cpi->workers[j];
+      VP9BitstreamWorkerData *const data =
+          (VP9BitstreamWorkerData *)worker->data2;
+      uint32_t tile_size;
+      int k;
+
+      if (!winterface->sync(worker)) return 0;
+      tile_size = data->bit_writer.pos;
+
+      // Aggregate per-thread bitstream stats.
+      cpi->max_mv_magnitude =
+          VPXMAX(cpi->max_mv_magnitude, data->max_mv_magnitude);
+      for (k = 0; k < SWITCHABLE; ++k) {
+        cpi->interp_filter_selected[0][k] += data->interp_filter_selected[0][k];
+      }
+
+      // Prefix the size of the tile on all but the last.
+      if (tile_col != tile_cols || j < i - 1) {
+        mem_put_be32(data_ptr + total_size, tile_size);
+        total_size += 4;
+      }
+      if (j > 0) {
+        memcpy(data_ptr + total_size, data->dest, tile_size);
+      }
+      total_size += tile_size;
+    }
+  }
+  return total_size;
+}
+
 static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
@@ -928,6 +1047,14 @@
   memset(cm->above_seg_context, 0,
          sizeof(*cm->above_seg_context) * mi_cols_aligned_to_sb(cm->mi_cols));
 
+  // Encoding tiles in parallel is done only for realtime mode now. In other
+  // modes the speed up is insignificant and requires further testing to ensure
+  // that it does not make the overall process worse in any case.
+  if (cpi->oxcf.mode == REALTIME && cpi->num_workers > 1 && tile_rows == 1 &&
+      tile_cols > 1) {
+    return encode_tiles_mt(cpi, data_ptr);
+  }
+
   for (tile_row = 0; tile_row < tile_rows; tile_row++) {
     for (tile_col = 0; tile_col < tile_cols; tile_col++) {
       int tile_idx = tile_row * tile_cols + tile_col;
@@ -955,7 +1082,6 @@
       total_size += residual_bc.pos;
     }
   }
-
   return total_size;
 }
 
diff --git a/vp9/encoder/vp9_bitstream.h b/vp9/encoder/vp9_bitstream.h
index 8c97d37..044a3bb 100644
--- a/vp9/encoder/vp9_bitstream.h
+++ b/vp9/encoder/vp9_bitstream.h
@@ -17,8 +17,26 @@
 
 #include "vp9/encoder/vp9_encoder.h"
 
+typedef struct VP9BitstreamWorkerData {
+  uint8_t *dest;  // buffer handed to vpx_start_encode() for this tile
+  int dest_size;  // bytes allocated for dest when the worker owns its buffer
+  TOKENEXTRA *tok;  // start of the tile's tokens; passed by address to write_modes()
+  TOKENEXTRA *tok_end;  // tok plus the tile's token count
+  vpx_writer bit_writer;
+  int tile_idx;  // index into cpi->tile_data
+  unsigned int max_mv_magnitude;
+  // The size of interp_filter_selected in VP9_COMP is actually
+  // MAX_REFERENCE_FRAMES x SWITCHABLE. But when encoding tiles, all we ever do
+  // is increment the very first index (index 0) for the first dimension. Hence
+  // this is sufficient.
+  int interp_filter_selected[1][SWITCHABLE];
+  DECLARE_ALIGNED(16, MACROBLOCKD, xd);
+} VP9BitstreamWorkerData;
+
 int vp9_get_refresh_mask(VP9_COMP *cpi);
 
+void vp9_bitstream_encode_tiles_buffer_dealloc(VP9_COMP *const cpi);
+
 void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size);
 
 static INLINE int vp9_preserve_existing_gf(VP9_COMP *cpi) {
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 36e288e..aff7b1d 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -799,6 +799,7 @@
   int min_var_32x32 = INT_MAX;
   int var_32x32;
   int avg_16x16[4];
+  int64_t threshold_4x4avg;
   NOISE_LEVEL noise_level = kLow;
   uint8_t *s;
   const uint8_t *d;
@@ -833,6 +834,9 @@
     }
   }
 
+  threshold_4x4avg =
+      (cpi->oxcf.speed < 8) ? thresholds[1] << 1 : thresholds[2] >> 1;
+
   memset(x->variance_low, 0, sizeof(x->variance_low));
 
   if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3);
@@ -999,7 +1003,7 @@
       }
       if (is_key_frame || (low_res &&
                            vt.split[i].split[j].part_variances.none.variance >
-                               (thresholds[1] << 1))) {
+                               threshold_4x4avg)) {
         force_split[split_index] = 0;
         // Go down to 4x4 down-sampling for variance.
         variance4x4downsample[i2 + j] = 1;
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index d98c493..43b708b 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -2030,7 +2030,10 @@
   vpx_free(cpi->tile_thr_data);
   vpx_free(cpi->workers);
 
-  if (cpi->num_workers > 1) vp9_loop_filter_dealloc(&cpi->lf_row_sync);
+  if (cpi->num_workers > 1) {
+    vp9_loop_filter_dealloc(&cpi->lf_row_sync);
+    vp9_bitstream_encode_tiles_buffer_dealloc(cpi);
+  }
 
   vp9_alt_ref_aq_destroy(cpi->alt_ref_aq);
 
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 77eb31c..e353d47 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -601,6 +601,7 @@
   VPxWorker *workers;
   struct EncWorkerData *tile_thr_data;
   VP9LfSync lf_row_sync;
+  struct VP9BitstreamWorkerData *vp9_bitstream_worker_data;
 
   int keep_level_stats;
   Vp9LevelInfo level_info;
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 6cbb59f..3e1ed50 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -508,7 +508,6 @@
       sf->short_circuit_low_temp_var = 2;
     }
     sf->limit_newmv_early_exit = 0;
-    sf->bias_golden = 0;
   }
 }
 
diff --git a/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c b/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c
index 68d5777..a980ab1 100644
--- a/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c
+++ b/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c
@@ -49,44 +49,55 @@
     } while (h > 0);
   } else if (w < 32) {  // copy16
     do {
-      vst1q_u16(dst, vld1q_u16(src));
-      vst1q_u16(dst + 8, vld1q_u16(src + 8));
+      vst2q_u16(dst, vld2q_u16(src));
       src += src_stride;
       dst += dst_stride;
-      vst1q_u16(dst, vld1q_u16(src));
-      vst1q_u16(dst + 8, vld1q_u16(src + 8));
+      vst2q_u16(dst, vld2q_u16(src));
       src += src_stride;
       dst += dst_stride;
-      h -= 2;
+      vst2q_u16(dst, vld2q_u16(src));
+      src += src_stride;
+      dst += dst_stride;
+      vst2q_u16(dst, vld2q_u16(src));
+      src += src_stride;
+      dst += dst_stride;
+      h -= 4;
     } while (h > 0);
   } else if (w == 32) {  // copy32
     do {
-      vst1q_u16(dst, vld1q_u16(src));
-      vst1q_u16(dst + 8, vld1q_u16(src + 8));
-      vst1q_u16(dst + 16, vld1q_u16(src + 16));
-      vst1q_u16(dst + 24, vld1q_u16(src + 24));
+      vst4q_u16(dst, vld4q_u16(src));
       src += src_stride;
       dst += dst_stride;
-      vst1q_u16(dst, vld1q_u16(src));
-      vst1q_u16(dst + 8, vld1q_u16(src + 8));
-      vst1q_u16(dst + 16, vld1q_u16(src + 16));
-      vst1q_u16(dst + 24, vld1q_u16(src + 24));
+      vst4q_u16(dst, vld4q_u16(src));
       src += src_stride;
       dst += dst_stride;
-      h -= 2;
+      vst4q_u16(dst, vld4q_u16(src));
+      src += src_stride;
+      dst += dst_stride;
+      vst4q_u16(dst, vld4q_u16(src));
+      src += src_stride;
+      dst += dst_stride;
+      h -= 4;
     } while (h > 0);
   } else {  // copy64
     do {
-      vst1q_u16(dst, vld1q_u16(src));
-      vst1q_u16(dst + 8, vld1q_u16(src + 8));
-      vst1q_u16(dst + 16, vld1q_u16(src + 16));
-      vst1q_u16(dst + 24, vld1q_u16(src + 24));
-      vst1q_u16(dst + 32, vld1q_u16(src + 32));
-      vst1q_u16(dst + 40, vld1q_u16(src + 40));
-      vst1q_u16(dst + 48, vld1q_u16(src + 48));
-      vst1q_u16(dst + 56, vld1q_u16(src + 56));
+      vst4q_u16(dst, vld4q_u16(src));
+      vst4q_u16(dst + 32, vld4q_u16(src + 32));
       src += src_stride;
       dst += dst_stride;
-    } while (--h);
+      vst4q_u16(dst, vld4q_u16(src));
+      vst4q_u16(dst + 32, vld4q_u16(src + 32));
+      src += src_stride;
+      dst += dst_stride;
+      vst4q_u16(dst, vld4q_u16(src));
+      vst4q_u16(dst + 32, vld4q_u16(src + 32));
+      src += src_stride;
+      dst += dst_stride;
+      vst4q_u16(dst, vld4q_u16(src));
+      vst4q_u16(dst + 32, vld4q_u16(src + 32));
+      src += src_stride;
+      dst += dst_stride;
+      h -= 4;
+    } while (h > 0);
   }
 }