Merge "asm_*_offsets to define variables as constants" into eider
diff --git a/build/make/configure.sh b/build/make/configure.sh
index 3c772e5..332593a 100755
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -967,7 +967,7 @@
                 esac
                 ;;
             gcc*)
-                add_cflags  -m${bits}
+                add_cflags -m${bits}
                 add_ldflags -m${bits}
                 link_with_cc=gcc
                 tune_cflags="-march="
diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h
index a4c1d92..7c648da 100644
--- a/vp8/common/blockd.h
+++ b/vp8/common/blockd.h
@@ -216,12 +216,6 @@
     MODE_INFO *mode_info_context;
     int mode_info_stride;
 
-#if CONFIG_TEMPORAL_DENOISING
-    MB_PREDICTION_MODE best_sse_inter_mode;
-    int_mv best_sse_mv;
-    unsigned char need_to_clamp_best_mvs;
-#endif
-
     FRAME_TYPE frame_type;
 
     int up_available;
diff --git a/vp8/common/rtcd_defs.sh b/vp8/common/rtcd_defs.sh
index 33bf08b..d6cbd4a 100644
--- a/vp8/common/rtcd_defs.sh
+++ b/vp8/common/rtcd_defs.sh
@@ -501,6 +501,14 @@
 prototype void vp8_yv12_copy_partial_frame "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc"
 specialize vp8_yv12_copy_partial_frame neon
 
+#
+# Denoiser filter
+#
+if [ "$CONFIG_TEMPORAL_DENOISING" = "yes" ]; then
+    prototype int vp8_denoiser_filter "struct yv12_buffer_config* mc_running_avg, struct yv12_buffer_config* running_avg, struct macroblock* signal, unsigned int motion_magnitude2, int y_offset, int uv_offset"
+    specialize vp8_denoiser_filter sse2
+fi
+
 # End of encoder only functions
 fi
 
diff --git a/vp8/common/x86/dequantize_mmx.asm b/vp8/common/x86/dequantize_mmx.asm
index de9eba8..4e551f0 100644
--- a/vp8/common/x86/dequantize_mmx.asm
+++ b/vp8/common/x86/dequantize_mmx.asm
@@ -13,7 +13,7 @@
 
 
 ;void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q)
-global sym(vp8_dequantize_b_impl_mmx)
+global sym(vp8_dequantize_b_impl_mmx) PRIVATE
 sym(vp8_dequantize_b_impl_mmx):
     push        rbp
     mov         rbp, rsp
@@ -55,7 +55,7 @@
 ;short *dq,               1
 ;unsigned char *dest,     2
 ;int stride)              3
-global sym(vp8_dequant_idct_add_mmx)
+global sym(vp8_dequant_idct_add_mmx) PRIVATE
 sym(vp8_dequant_idct_add_mmx):
     push        rbp
     mov         rbp, rsp
diff --git a/vp8/common/x86/idctllm_mmx.asm b/vp8/common/x86/idctllm_mmx.asm
index 0c9c205..96fa2c6 100644
--- a/vp8/common/x86/idctllm_mmx.asm
+++ b/vp8/common/x86/idctllm_mmx.asm
@@ -34,7 +34,7 @@
 
 ;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred,
 ;int pitch, unsigned char *dest,int stride)
-global sym(vp8_short_idct4x4llm_mmx)
+global sym(vp8_short_idct4x4llm_mmx) PRIVATE
 sym(vp8_short_idct4x4llm_mmx):
     push        rbp
     mov         rbp, rsp
@@ -224,7 +224,7 @@
 ;int pred_stride,
 ;unsigned char *dst_ptr,
 ;int stride)
-global sym(vp8_dc_only_idct_add_mmx)
+global sym(vp8_dc_only_idct_add_mmx) PRIVATE
 sym(vp8_dc_only_idct_add_mmx):
     push        rbp
     mov         rbp, rsp
diff --git a/vp8/common/x86/idctllm_sse2.asm b/vp8/common/x86/idctllm_sse2.asm
index abeb0b6..bf8e2c4 100644
--- a/vp8/common/x86/idctllm_sse2.asm
+++ b/vp8/common/x86/idctllm_sse2.asm
@@ -19,7 +19,7 @@
 ;   int dst_stride      - 3
 ; )
 
-global sym(vp8_idct_dequant_0_2x_sse2)
+global sym(vp8_idct_dequant_0_2x_sse2) PRIVATE
 sym(vp8_idct_dequant_0_2x_sse2):
     push        rbp
     mov         rbp, rsp
@@ -101,7 +101,7 @@
 ;   unsigned char *dst  - 2
 ;   int dst_stride      - 3
 ; )
-global sym(vp8_idct_dequant_full_2x_sse2)
+global sym(vp8_idct_dequant_full_2x_sse2) PRIVATE
 sym(vp8_idct_dequant_full_2x_sse2):
     push        rbp
     mov         rbp, rsp
@@ -358,7 +358,7 @@
 ;   int dst_stride      - 3
 ;   short *dc           - 4
 ; )
-global sym(vp8_idct_dequant_dc_0_2x_sse2)
+global sym(vp8_idct_dequant_dc_0_2x_sse2) PRIVATE
 sym(vp8_idct_dequant_dc_0_2x_sse2):
     push        rbp
     mov         rbp, rsp
@@ -434,7 +434,7 @@
 ;   int dst_stride      - 3
 ;   short *dc           - 4
 ; )
-global sym(vp8_idct_dequant_dc_full_2x_sse2)
+global sym(vp8_idct_dequant_dc_full_2x_sse2) PRIVATE
 sym(vp8_idct_dequant_dc_full_2x_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/vp8/common/x86/iwalsh_mmx.asm b/vp8/common/x86/iwalsh_mmx.asm
index 6582687..4aac094 100644
--- a/vp8/common/x86/iwalsh_mmx.asm
+++ b/vp8/common/x86/iwalsh_mmx.asm
@@ -12,7 +12,7 @@
 %include "vpx_ports/x86_abi_support.asm"
 
 ;void vp8_short_inv_walsh4x4_mmx(short *input, short *output)
-global sym(vp8_short_inv_walsh4x4_mmx)
+global sym(vp8_short_inv_walsh4x4_mmx) PRIVATE
 sym(vp8_short_inv_walsh4x4_mmx):
     push        rbp
     mov         rbp, rsp
diff --git a/vp8/common/x86/iwalsh_sse2.asm b/vp8/common/x86/iwalsh_sse2.asm
index 51cb5e2..06e86a8 100644
--- a/vp8/common/x86/iwalsh_sse2.asm
+++ b/vp8/common/x86/iwalsh_sse2.asm
@@ -12,7 +12,7 @@
 %include "vpx_ports/x86_abi_support.asm"
 
 ;void vp8_short_inv_walsh4x4_sse2(short *input, short *output)
-global sym(vp8_short_inv_walsh4x4_sse2)
+global sym(vp8_short_inv_walsh4x4_sse2) PRIVATE
 sym(vp8_short_inv_walsh4x4_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/vp8/common/x86/loopfilter_block_sse2.asm b/vp8/common/x86/loopfilter_block_sse2.asm
index 4918eb5..1c445ef 100644
--- a/vp8/common/x86/loopfilter_block_sse2.asm
+++ b/vp8/common/x86/loopfilter_block_sse2.asm
@@ -133,7 +133,7 @@
 ;    const char    *limit,
 ;    const char    *thresh
 ;)
-global sym(vp8_loop_filter_bh_y_sse2)
+global sym(vp8_loop_filter_bh_y_sse2) PRIVATE
 sym(vp8_loop_filter_bh_y_sse2):
 
 %ifidn __OUTPUT_FORMAT__,x64
@@ -273,7 +273,7 @@
 ;    const char    *thresh
 ;)
 
-global sym(vp8_loop_filter_bv_y_sse2)
+global sym(vp8_loop_filter_bv_y_sse2) PRIVATE
 sym(vp8_loop_filter_bv_y_sse2):
 
 %ifidn __OUTPUT_FORMAT__,x64
diff --git a/vp8/common/x86/loopfilter_mmx.asm b/vp8/common/x86/loopfilter_mmx.asm
index 697a5de..f388d24 100644
--- a/vp8/common/x86/loopfilter_mmx.asm
+++ b/vp8/common/x86/loopfilter_mmx.asm
@@ -21,7 +21,7 @@
 ;    const char *thresh,
 ;    int  count
 ;)
-global sym(vp8_loop_filter_horizontal_edge_mmx)
+global sym(vp8_loop_filter_horizontal_edge_mmx) PRIVATE
 sym(vp8_loop_filter_horizontal_edge_mmx):
     push        rbp
     mov         rbp, rsp
@@ -233,7 +233,7 @@
 ;    const char *thresh,
 ;    int count
 ;)
-global sym(vp8_loop_filter_vertical_edge_mmx)
+global sym(vp8_loop_filter_vertical_edge_mmx) PRIVATE
 sym(vp8_loop_filter_vertical_edge_mmx):
     push        rbp
     mov         rbp, rsp
@@ -603,7 +603,7 @@
 ;    const char *thresh,
 ;    int count
 ;)
-global sym(vp8_mbloop_filter_horizontal_edge_mmx)
+global sym(vp8_mbloop_filter_horizontal_edge_mmx) PRIVATE
 sym(vp8_mbloop_filter_horizontal_edge_mmx):
     push        rbp
     mov         rbp, rsp
@@ -920,7 +920,7 @@
 ;    const char *thresh,
 ;    int count
 ;)
-global sym(vp8_mbloop_filter_vertical_edge_mmx)
+global sym(vp8_mbloop_filter_vertical_edge_mmx) PRIVATE
 sym(vp8_mbloop_filter_vertical_edge_mmx):
     push        rbp
     mov         rbp, rsp
@@ -1384,7 +1384,7 @@
 ;    int  src_pixel_step,
 ;    const char *blimit
 ;)
-global sym(vp8_loop_filter_simple_horizontal_edge_mmx)
+global sym(vp8_loop_filter_simple_horizontal_edge_mmx) PRIVATE
 sym(vp8_loop_filter_simple_horizontal_edge_mmx):
     push        rbp
     mov         rbp, rsp
@@ -1500,7 +1500,7 @@
 ;    int  src_pixel_step,
 ;    const char *blimit
 ;)
-global sym(vp8_loop_filter_simple_vertical_edge_mmx)
+global sym(vp8_loop_filter_simple_vertical_edge_mmx) PRIVATE
 sym(vp8_loop_filter_simple_vertical_edge_mmx):
     push        rbp
     mov         rbp, rsp
diff --git a/vp8/common/x86/loopfilter_sse2.asm b/vp8/common/x86/loopfilter_sse2.asm
index 9944c33..a66753b 100644
--- a/vp8/common/x86/loopfilter_sse2.asm
+++ b/vp8/common/x86/loopfilter_sse2.asm
@@ -286,7 +286,7 @@
 ;    const char    *limit,
 ;    const char    *thresh,
 ;)
-global sym(vp8_loop_filter_horizontal_edge_sse2)
+global sym(vp8_loop_filter_horizontal_edge_sse2) PRIVATE
 sym(vp8_loop_filter_horizontal_edge_sse2):
     push        rbp
     mov         rbp, rsp
@@ -334,7 +334,7 @@
 ;    const char    *thresh,
 ;    int            count
 ;)
-global sym(vp8_loop_filter_horizontal_edge_uv_sse2)
+global sym(vp8_loop_filter_horizontal_edge_uv_sse2) PRIVATE
 sym(vp8_loop_filter_horizontal_edge_uv_sse2):
     push        rbp
     mov         rbp, rsp
@@ -561,7 +561,7 @@
 ;    const char    *limit,
 ;    const char    *thresh,
 ;)
-global sym(vp8_mbloop_filter_horizontal_edge_sse2)
+global sym(vp8_mbloop_filter_horizontal_edge_sse2) PRIVATE
 sym(vp8_mbloop_filter_horizontal_edge_sse2):
     push        rbp
     mov         rbp, rsp
@@ -607,7 +607,7 @@
 ;    const char    *thresh,
 ;    unsigned char *v
 ;)
-global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2)
+global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2) PRIVATE
 sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
     push        rbp
     mov         rbp, rsp
@@ -928,7 +928,7 @@
 ;    const char    *limit,
 ;    const char    *thresh,
 ;)
-global sym(vp8_loop_filter_vertical_edge_sse2)
+global sym(vp8_loop_filter_vertical_edge_sse2) PRIVATE
 sym(vp8_loop_filter_vertical_edge_sse2):
     push        rbp
     mov         rbp, rsp
@@ -993,7 +993,7 @@
 ;    const char    *thresh,
 ;    unsigned char *v
 ;)
-global sym(vp8_loop_filter_vertical_edge_uv_sse2)
+global sym(vp8_loop_filter_vertical_edge_uv_sse2) PRIVATE
 sym(vp8_loop_filter_vertical_edge_uv_sse2):
     push        rbp
     mov         rbp, rsp
@@ -1142,7 +1142,7 @@
 ;    const char    *limit,
 ;    const char    *thresh,
 ;)
-global sym(vp8_mbloop_filter_vertical_edge_sse2)
+global sym(vp8_mbloop_filter_vertical_edge_sse2) PRIVATE
 sym(vp8_mbloop_filter_vertical_edge_sse2):
     push        rbp
     mov         rbp, rsp
@@ -1209,7 +1209,7 @@
 ;    const char    *thresh,
 ;    unsigned char *v
 ;)
-global sym(vp8_mbloop_filter_vertical_edge_uv_sse2)
+global sym(vp8_mbloop_filter_vertical_edge_uv_sse2) PRIVATE
 sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
     push        rbp
     mov         rbp, rsp
@@ -1269,7 +1269,7 @@
 ;    int  src_pixel_step,
 ;    const char *blimit,
 ;)
-global sym(vp8_loop_filter_simple_horizontal_edge_sse2)
+global sym(vp8_loop_filter_simple_horizontal_edge_sse2) PRIVATE
 sym(vp8_loop_filter_simple_horizontal_edge_sse2):
     push        rbp
     mov         rbp, rsp
@@ -1374,7 +1374,7 @@
 ;    int  src_pixel_step,
 ;    const char *blimit,
 ;)
-global sym(vp8_loop_filter_simple_vertical_edge_sse2)
+global sym(vp8_loop_filter_simple_vertical_edge_sse2) PRIVATE
 sym(vp8_loop_filter_simple_vertical_edge_sse2):
     push        rbp         ; save old base pointer value.
     mov         rbp, rsp    ; set new base pointer value.
diff --git a/vp8/common/x86/mfqe_sse2.asm b/vp8/common/x86/mfqe_sse2.asm
index 10d21f3..c1d2174 100644
--- a/vp8/common/x86/mfqe_sse2.asm
+++ b/vp8/common/x86/mfqe_sse2.asm
@@ -19,7 +19,7 @@
 ;    int            dst_stride,
 ;    int            src_weight
 ;)
-global sym(vp8_filter_by_weight16x16_sse2)
+global sym(vp8_filter_by_weight16x16_sse2) PRIVATE
 sym(vp8_filter_by_weight16x16_sse2):
     push        rbp
     mov         rbp, rsp
@@ -97,7 +97,7 @@
 ;    int            dst_stride,
 ;    int            src_weight
 ;)
-global sym(vp8_filter_by_weight8x8_sse2)
+global sym(vp8_filter_by_weight8x8_sse2) PRIVATE
 sym(vp8_filter_by_weight8x8_sse2):
     push        rbp
     mov         rbp, rsp
@@ -165,7 +165,7 @@
 ;    unsigned int  *variance,      4
 ;    unsigned int  *sad,           5
 ;)
-global sym(vp8_variance_and_sad_16x16_sse2)
+global sym(vp8_variance_and_sad_16x16_sse2) PRIVATE
 sym(vp8_variance_and_sad_16x16_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/vp8/common/x86/postproc_mmx.asm b/vp8/common/x86/postproc_mmx.asm
index d24f740..534f296 100644
--- a/vp8/common/x86/postproc_mmx.asm
+++ b/vp8/common/x86/postproc_mmx.asm
@@ -24,7 +24,7 @@
 ;    int cols,
 ;    int flimit
 ;)
-global sym(vp8_post_proc_down_and_across_mmx)
+global sym(vp8_post_proc_down_and_across_mmx) PRIVATE
 sym(vp8_post_proc_down_and_across_mmx):
     push        rbp
     mov         rbp, rsp
@@ -282,7 +282,7 @@
 ;void vp8_mbpost_proc_down_mmx(unsigned char *dst,
 ;                             int pitch, int rows, int cols,int flimit)
 extern sym(vp8_rv)
-global sym(vp8_mbpost_proc_down_mmx)
+global sym(vp8_mbpost_proc_down_mmx) PRIVATE
 sym(vp8_mbpost_proc_down_mmx):
     push        rbp
     mov         rbp, rsp
@@ -510,7 +510,7 @@
 ;                            unsigned char bothclamp[16],
 ;                            unsigned int Width, unsigned int Height, int Pitch)
 extern sym(rand)
-global sym(vp8_plane_add_noise_mmx)
+global sym(vp8_plane_add_noise_mmx) PRIVATE
 sym(vp8_plane_add_noise_mmx):
     push        rbp
     mov         rbp, rsp
diff --git a/vp8/common/x86/postproc_sse2.asm b/vp8/common/x86/postproc_sse2.asm
index 966aafd..bf36b0d 100644
--- a/vp8/common/x86/postproc_sse2.asm
+++ b/vp8/common/x86/postproc_sse2.asm
@@ -21,7 +21,7 @@
 ;    int cols,
 ;    int flimit
 ;)
-global sym(vp8_post_proc_down_and_across_xmm)
+global sym(vp8_post_proc_down_and_across_xmm) PRIVATE
 sym(vp8_post_proc_down_and_across_xmm):
     push        rbp
     mov         rbp, rsp
@@ -269,7 +269,7 @@
 ;void vp8_mbpost_proc_down_xmm(unsigned char *dst,
 ;                            int pitch, int rows, int cols,int flimit)
 extern sym(vp8_rv)
-global sym(vp8_mbpost_proc_down_xmm)
+global sym(vp8_mbpost_proc_down_xmm) PRIVATE
 sym(vp8_mbpost_proc_down_xmm):
     push        rbp
     mov         rbp, rsp
@@ -497,7 +497,7 @@
 
 ;void vp8_mbpost_proc_across_ip_xmm(unsigned char *src,
 ;                                int pitch, int rows, int cols,int flimit)
-global sym(vp8_mbpost_proc_across_ip_xmm)
+global sym(vp8_mbpost_proc_across_ip_xmm) PRIVATE
 sym(vp8_mbpost_proc_across_ip_xmm):
     push        rbp
     mov         rbp, rsp
@@ -694,7 +694,7 @@
 ;                            unsigned char bothclamp[16],
 ;                            unsigned int Width, unsigned int Height, int Pitch)
 extern sym(rand)
-global sym(vp8_plane_add_noise_wmt)
+global sym(vp8_plane_add_noise_wmt) PRIVATE
 sym(vp8_plane_add_noise_wmt):
     push        rbp
     mov         rbp, rsp
diff --git a/vp8/common/x86/recon_mmx.asm b/vp8/common/x86/recon_mmx.asm
index 19c0faf..15e9871 100644
--- a/vp8/common/x86/recon_mmx.asm
+++ b/vp8/common/x86/recon_mmx.asm
@@ -18,7 +18,7 @@
 ;    unsigned char *dst,
 ;    int dst_stride
 ;    )
-global sym(vp8_copy_mem8x8_mmx)
+global sym(vp8_copy_mem8x8_mmx) PRIVATE
 sym(vp8_copy_mem8x8_mmx):
     push        rbp
     mov         rbp, rsp
@@ -81,7 +81,7 @@
 ;    unsigned char *dst,
 ;    int dst_stride
 ;    )
-global sym(vp8_copy_mem8x4_mmx)
+global sym(vp8_copy_mem8x4_mmx) PRIVATE
 sym(vp8_copy_mem8x4_mmx):
     push        rbp
     mov         rbp, rsp
@@ -125,7 +125,7 @@
 ;    unsigned char *dst,
 ;    int dst_stride
 ;    )
-global sym(vp8_copy_mem16x16_mmx)
+global sym(vp8_copy_mem16x16_mmx) PRIVATE
 sym(vp8_copy_mem16x16_mmx):
     push        rbp
     mov         rbp, rsp
diff --git a/vp8/common/x86/recon_sse2.asm b/vp8/common/x86/recon_sse2.asm
index 7b6e3cf..fe77450 100644
--- a/vp8/common/x86/recon_sse2.asm
+++ b/vp8/common/x86/recon_sse2.asm
@@ -17,7 +17,7 @@
 ;    unsigned char *dst,
 ;    int dst_stride
 ;    )
-global sym(vp8_copy_mem16x16_sse2)
+global sym(vp8_copy_mem16x16_sse2) PRIVATE
 sym(vp8_copy_mem16x16_sse2):
     push        rbp
     mov         rbp, rsp
@@ -123,7 +123,7 @@
 ;    unsigned char *left,
 ;    int left_stride,
 ;    )
-global sym(vp8_intra_pred_uv_dc_mmx2)
+global sym(vp8_intra_pred_uv_dc_mmx2) PRIVATE
 sym(vp8_intra_pred_uv_dc_mmx2):
     push        rbp
     mov         rbp, rsp
@@ -196,7 +196,7 @@
 ;    unsigned char *left,
 ;    int left_stride,
 ;    )
-global sym(vp8_intra_pred_uv_dctop_mmx2)
+global sym(vp8_intra_pred_uv_dctop_mmx2) PRIVATE
 sym(vp8_intra_pred_uv_dctop_mmx2):
     push        rbp
     mov         rbp, rsp
@@ -250,7 +250,7 @@
 ;    unsigned char *left,
 ;    int left_stride,
 ;    )
-global sym(vp8_intra_pred_uv_dcleft_mmx2)
+global sym(vp8_intra_pred_uv_dcleft_mmx2) PRIVATE
 sym(vp8_intra_pred_uv_dcleft_mmx2):
     push        rbp
     mov         rbp, rsp
@@ -317,7 +317,7 @@
 ;    unsigned char *left,
 ;    int left_stride,
 ;    )
-global sym(vp8_intra_pred_uv_dc128_mmx)
+global sym(vp8_intra_pred_uv_dc128_mmx) PRIVATE
 sym(vp8_intra_pred_uv_dc128_mmx):
     push        rbp
     mov         rbp, rsp
@@ -357,7 +357,7 @@
 ;    int left_stride,
 ;    )
 %macro vp8_intra_pred_uv_tm 1
-global sym(vp8_intra_pred_uv_tm_%1)
+global sym(vp8_intra_pred_uv_tm_%1) PRIVATE
 sym(vp8_intra_pred_uv_tm_%1):
     push        rbp
     mov         rbp, rsp
@@ -437,7 +437,7 @@
 ;    unsigned char *left,
 ;    int left_stride,
 ;    )
-global sym(vp8_intra_pred_uv_ve_mmx)
+global sym(vp8_intra_pred_uv_ve_mmx) PRIVATE
 sym(vp8_intra_pred_uv_ve_mmx):
     push        rbp
     mov         rbp, rsp
@@ -479,7 +479,7 @@
 ;    int left_stride
 ;    )
 %macro vp8_intra_pred_uv_ho 1
-global sym(vp8_intra_pred_uv_ho_%1)
+global sym(vp8_intra_pred_uv_ho_%1) PRIVATE
 sym(vp8_intra_pred_uv_ho_%1):
     push        rbp
     mov         rbp, rsp
@@ -577,7 +577,7 @@
 ;    unsigned char *left,
 ;    int left_stride
 ;    )
-global sym(vp8_intra_pred_y_dc_sse2)
+global sym(vp8_intra_pred_y_dc_sse2) PRIVATE
 sym(vp8_intra_pred_y_dc_sse2):
     push        rbp
     mov         rbp, rsp
@@ -683,7 +683,7 @@
 ;    unsigned char *left,
 ;    int left_stride
 ;    )
-global sym(vp8_intra_pred_y_dctop_sse2)
+global sym(vp8_intra_pred_y_dctop_sse2) PRIVATE
 sym(vp8_intra_pred_y_dctop_sse2):
     push        rbp
     mov         rbp, rsp
@@ -745,7 +745,7 @@
 ;    unsigned char *left,
 ;    int left_stride
 ;    )
-global sym(vp8_intra_pred_y_dcleft_sse2)
+global sym(vp8_intra_pred_y_dcleft_sse2) PRIVATE
 sym(vp8_intra_pred_y_dcleft_sse2):
     push        rbp
     mov         rbp, rsp
@@ -838,7 +838,7 @@
 ;    unsigned char *left,
 ;    int left_stride
 ;    )
-global sym(vp8_intra_pred_y_dc128_sse2)
+global sym(vp8_intra_pred_y_dc128_sse2) PRIVATE
 sym(vp8_intra_pred_y_dc128_sse2):
     push        rbp
     mov         rbp, rsp
@@ -885,7 +885,7 @@
 ;    int left_stride
 ;    )
 %macro vp8_intra_pred_y_tm 1
-global sym(vp8_intra_pred_y_tm_%1)
+global sym(vp8_intra_pred_y_tm_%1) PRIVATE
 sym(vp8_intra_pred_y_tm_%1):
     push        rbp
     mov         rbp, rsp
@@ -972,7 +972,7 @@
 ;    unsigned char *left,
 ;    int left_stride
 ;    )
-global sym(vp8_intra_pred_y_ve_sse2)
+global sym(vp8_intra_pred_y_ve_sse2) PRIVATE
 sym(vp8_intra_pred_y_ve_sse2):
     push        rbp
     mov         rbp, rsp
@@ -1020,7 +1020,7 @@
 ;    unsigned char *left,
 ;    int left_stride,
 ;    )
-global sym(vp8_intra_pred_y_ho_sse2)
+global sym(vp8_intra_pred_y_ho_sse2) PRIVATE
 sym(vp8_intra_pred_y_ho_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/vp8/common/x86/sad_mmx.asm b/vp8/common/x86/sad_mmx.asm
index 407b399..592112f 100644
--- a/vp8/common/x86/sad_mmx.asm
+++ b/vp8/common/x86/sad_mmx.asm
@@ -11,11 +11,11 @@
 
 %include "vpx_ports/x86_abi_support.asm"
 
-global sym(vp8_sad16x16_mmx)
-global sym(vp8_sad8x16_mmx)
-global sym(vp8_sad8x8_mmx)
-global sym(vp8_sad4x4_mmx)
-global sym(vp8_sad16x8_mmx)
+global sym(vp8_sad16x16_mmx) PRIVATE
+global sym(vp8_sad8x16_mmx) PRIVATE
+global sym(vp8_sad8x8_mmx) PRIVATE
+global sym(vp8_sad4x4_mmx) PRIVATE
+global sym(vp8_sad16x8_mmx) PRIVATE
 
 ;unsigned int vp8_sad16x16_mmx(
 ;    unsigned char *src_ptr,
diff --git a/vp8/common/x86/sad_sse2.asm b/vp8/common/x86/sad_sse2.asm
index 0b01d7b..290e676 100644
--- a/vp8/common/x86/sad_sse2.asm
+++ b/vp8/common/x86/sad_sse2.asm
@@ -16,7 +16,7 @@
 ;    int  src_stride,
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride)
-global sym(vp8_sad16x16_wmt)
+global sym(vp8_sad16x16_wmt) PRIVATE
 sym(vp8_sad16x16_wmt):
     push        rbp
     mov         rbp, rsp
@@ -90,7 +90,7 @@
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    int  max_sad)
-global sym(vp8_sad8x16_wmt)
+global sym(vp8_sad8x16_wmt) PRIVATE
 sym(vp8_sad8x16_wmt):
     push        rbp
     mov         rbp, rsp
@@ -153,7 +153,7 @@
 ;    int  src_stride,
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride)
-global sym(vp8_sad8x8_wmt)
+global sym(vp8_sad8x8_wmt) PRIVATE
 sym(vp8_sad8x8_wmt):
     push        rbp
     mov         rbp, rsp
@@ -206,7 +206,7 @@
 ;    int  src_stride,
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride)
-global sym(vp8_sad4x4_wmt)
+global sym(vp8_sad4x4_wmt) PRIVATE
 sym(vp8_sad4x4_wmt):
     push        rbp
     mov         rbp, rsp
@@ -261,7 +261,7 @@
 ;    int  src_stride,
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride)
-global sym(vp8_sad16x8_wmt)
+global sym(vp8_sad16x8_wmt) PRIVATE
 sym(vp8_sad16x8_wmt):
     push        rbp
     mov         rbp, rsp
@@ -335,7 +335,7 @@
 ;    unsigned char *dst_ptr,
 ;    int  dst_stride,
 ;    int height);
-global sym(vp8_copy32xn_sse2)
+global sym(vp8_copy32xn_sse2) PRIVATE
 sym(vp8_copy32xn_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/vp8/common/x86/sad_sse3.asm b/vp8/common/x86/sad_sse3.asm
index c2af3c8..f90a589 100644
--- a/vp8/common/x86/sad_sse3.asm
+++ b/vp8/common/x86/sad_sse3.asm
@@ -380,7 +380,7 @@
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    int  *results)
-global sym(vp8_sad16x16x3_sse3)
+global sym(vp8_sad16x16x3_sse3) PRIVATE
 sym(vp8_sad16x16x3_sse3):
 
     STACK_FRAME_CREATE_X3
@@ -422,7 +422,7 @@
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    int  *results)
-global sym(vp8_sad16x8x3_sse3)
+global sym(vp8_sad16x8x3_sse3) PRIVATE
 sym(vp8_sad16x8x3_sse3):
 
     STACK_FRAME_CREATE_X3
@@ -460,7 +460,7 @@
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    int  *results)
-global sym(vp8_sad8x16x3_sse3)
+global sym(vp8_sad8x16x3_sse3) PRIVATE
 sym(vp8_sad8x16x3_sse3):
 
     STACK_FRAME_CREATE_X3
@@ -489,7 +489,7 @@
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    int  *results)
-global sym(vp8_sad8x8x3_sse3)
+global sym(vp8_sad8x8x3_sse3) PRIVATE
 sym(vp8_sad8x8x3_sse3):
 
     STACK_FRAME_CREATE_X3
@@ -514,7 +514,7 @@
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    int  *results)
-global sym(vp8_sad4x4x3_sse3)
+global sym(vp8_sad4x4x3_sse3) PRIVATE
 sym(vp8_sad4x4x3_sse3):
 
     STACK_FRAME_CREATE_X3
@@ -589,7 +589,7 @@
 ;    int  ref_stride,
 ;    int  max_sad)
 ;%define lddqu movdqu
-global sym(vp8_sad16x16_sse3)
+global sym(vp8_sad16x16_sse3) PRIVATE
 sym(vp8_sad16x16_sse3):
 
     STACK_FRAME_CREATE_X3
@@ -642,7 +642,7 @@
 ;    unsigned char *dst_ptr,
 ;    int  dst_stride,
 ;    int height);
-global sym(vp8_copy32xn_sse3)
+global sym(vp8_copy32xn_sse3) PRIVATE
 sym(vp8_copy32xn_sse3):
 
     STACK_FRAME_CREATE_X3
@@ -703,7 +703,7 @@
 ;    unsigned char *ref_ptr_base,
 ;    int  ref_stride,
 ;    int  *results)
-global sym(vp8_sad16x16x4d_sse3)
+global sym(vp8_sad16x16x4d_sse3) PRIVATE
 sym(vp8_sad16x16x4d_sse3):
 
     STACK_FRAME_CREATE_X4
@@ -754,7 +754,7 @@
 ;    unsigned char *ref_ptr_base,
 ;    int  ref_stride,
 ;    int  *results)
-global sym(vp8_sad16x8x4d_sse3)
+global sym(vp8_sad16x8x4d_sse3) PRIVATE
 sym(vp8_sad16x8x4d_sse3):
 
     STACK_FRAME_CREATE_X4
@@ -801,7 +801,7 @@
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    int  *results)
-global sym(vp8_sad8x16x4d_sse3)
+global sym(vp8_sad8x16x4d_sse3) PRIVATE
 sym(vp8_sad8x16x4d_sse3):
 
     STACK_FRAME_CREATE_X4
@@ -834,7 +834,7 @@
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    int  *results)
-global sym(vp8_sad8x8x4d_sse3)
+global sym(vp8_sad8x8x4d_sse3) PRIVATE
 sym(vp8_sad8x8x4d_sse3):
 
     STACK_FRAME_CREATE_X4
@@ -863,7 +863,7 @@
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    int  *results)
-global sym(vp8_sad4x4x4d_sse3)
+global sym(vp8_sad4x4x4d_sse3) PRIVATE
 sym(vp8_sad4x4x4d_sse3):
 
     STACK_FRAME_CREATE_X4
diff --git a/vp8/common/x86/sad_sse4.asm b/vp8/common/x86/sad_sse4.asm
index 03ecec4..f7fccd7 100644
--- a/vp8/common/x86/sad_sse4.asm
+++ b/vp8/common/x86/sad_sse4.asm
@@ -161,7 +161,7 @@
 ;    const unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    unsigned short *sad_array);
-global sym(vp8_sad16x16x8_sse4)
+global sym(vp8_sad16x16x8_sse4) PRIVATE
 sym(vp8_sad16x16x8_sse4):
     push        rbp
     mov         rbp, rsp
@@ -203,7 +203,7 @@
 ;    int  ref_stride,
 ;    unsigned short *sad_array
 ;);
-global sym(vp8_sad16x8x8_sse4)
+global sym(vp8_sad16x8x8_sse4) PRIVATE
 sym(vp8_sad16x8x8_sse4):
     push        rbp
     mov         rbp, rsp
@@ -241,7 +241,7 @@
 ;    int  ref_stride,
 ;    unsigned short *sad_array
 ;);
-global sym(vp8_sad8x8x8_sse4)
+global sym(vp8_sad8x8x8_sse4) PRIVATE
 sym(vp8_sad8x8x8_sse4):
     push        rbp
     mov         rbp, rsp
@@ -279,7 +279,7 @@
 ;    int  ref_stride,
 ;    unsigned short *sad_array
 ;);
-global sym(vp8_sad8x16x8_sse4)
+global sym(vp8_sad8x16x8_sse4) PRIVATE
 sym(vp8_sad8x16x8_sse4):
     push        rbp
     mov         rbp, rsp
@@ -320,7 +320,7 @@
 ;    int  ref_stride,
 ;    unsigned short *sad_array
 ;);
-global sym(vp8_sad4x4x8_sse4)
+global sym(vp8_sad4x4x8_sse4) PRIVATE
 sym(vp8_sad4x4x8_sse4):
     push        rbp
     mov         rbp, rsp
diff --git a/vp8/common/x86/sad_ssse3.asm b/vp8/common/x86/sad_ssse3.asm
index 95b6c89..278fc06 100644
--- a/vp8/common/x86/sad_ssse3.asm
+++ b/vp8/common/x86/sad_ssse3.asm
@@ -152,7 +152,7 @@
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    int  *results)
-global sym(vp8_sad16x16x3_ssse3)
+global sym(vp8_sad16x16x3_ssse3) PRIVATE
 sym(vp8_sad16x16x3_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -265,7 +265,7 @@
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    int  *results)
-global sym(vp8_sad16x8x3_ssse3)
+global sym(vp8_sad16x8x3_ssse3) PRIVATE
 sym(vp8_sad16x8x3_ssse3):
     push        rbp
     mov         rbp, rsp
diff --git a/vp8/common/x86/subpixel_mmx.asm b/vp8/common/x86/subpixel_mmx.asm
index 5528fd0..47dd452 100644
--- a/vp8/common/x86/subpixel_mmx.asm
+++ b/vp8/common/x86/subpixel_mmx.asm
@@ -28,7 +28,7 @@
 ;    unsigned int    output_width,
 ;    short           * vp8_filter
 ;)
-global sym(vp8_filter_block1d_h6_mmx)
+global sym(vp8_filter_block1d_h6_mmx) PRIVATE
 sym(vp8_filter_block1d_h6_mmx):
     push        rbp
     mov         rbp, rsp
@@ -125,7 +125,7 @@
 ;   unsigned int output_width,
 ;   short * vp8_filter
 ;)
-global sym(vp8_filter_block1dc_v6_mmx)
+global sym(vp8_filter_block1dc_v6_mmx) PRIVATE
 sym(vp8_filter_block1dc_v6_mmx):
     push        rbp
     mov         rbp, rsp
@@ -213,7 +213,7 @@
 ;   unsigned char *dst_ptr,
 ;    int dst_pitch
 ;)
-global sym(vp8_bilinear_predict8x8_mmx)
+global sym(vp8_bilinear_predict8x8_mmx) PRIVATE
 sym(vp8_bilinear_predict8x8_mmx):
     push        rbp
     mov         rbp, rsp
@@ -370,7 +370,7 @@
 ;    unsigned char *dst_ptr,
 ;    int dst_pitch
 ;)
-global sym(vp8_bilinear_predict8x4_mmx)
+global sym(vp8_bilinear_predict8x4_mmx) PRIVATE
 sym(vp8_bilinear_predict8x4_mmx):
     push        rbp
     mov         rbp, rsp
@@ -525,7 +525,7 @@
 ;    unsigned char *dst_ptr,
 ;    int dst_pitch
 ;)
-global sym(vp8_bilinear_predict4x4_mmx)
+global sym(vp8_bilinear_predict4x4_mmx) PRIVATE
 sym(vp8_bilinear_predict4x4_mmx):
     push        rbp
     mov         rbp, rsp
diff --git a/vp8/common/x86/subpixel_sse2.asm b/vp8/common/x86/subpixel_sse2.asm
index cb550af..69f8d10 100644
--- a/vp8/common/x86/subpixel_sse2.asm
+++ b/vp8/common/x86/subpixel_sse2.asm
@@ -33,7 +33,7 @@
 ;    unsigned int    output_width,
 ;    short           *vp8_filter
 ;)
-global sym(vp8_filter_block1d8_h6_sse2)
+global sym(vp8_filter_block1d8_h6_sse2) PRIVATE
 sym(vp8_filter_block1d8_h6_sse2):
     push        rbp
     mov         rbp, rsp
@@ -153,7 +153,7 @@
 ; even number. This function handles 8 pixels in horizontal direction, calculating ONE
 ; rows each iteration to take advantage of the 128 bits operations.
 ;*************************************************************************************/
-global sym(vp8_filter_block1d16_h6_sse2)
+global sym(vp8_filter_block1d16_h6_sse2) PRIVATE
 sym(vp8_filter_block1d16_h6_sse2):
     push        rbp
     mov         rbp, rsp
@@ -329,7 +329,7 @@
 ; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The
 ; input pixel array has output_height rows.
 ;*************************************************************************************/
-global sym(vp8_filter_block1d8_v6_sse2)
+global sym(vp8_filter_block1d8_v6_sse2) PRIVATE
 sym(vp8_filter_block1d8_v6_sse2):
     push        rbp
     mov         rbp, rsp
@@ -424,7 +424,7 @@
 ; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The
 ; input pixel array has output_height rows.
 ;*************************************************************************************/
-global sym(vp8_filter_block1d16_v6_sse2)
+global sym(vp8_filter_block1d16_v6_sse2) PRIVATE
 sym(vp8_filter_block1d16_v6_sse2):
     push        rbp
     mov         rbp, rsp
@@ -534,7 +534,7 @@
 ;    const short    *vp8_filter
 ;)
 ; First-pass filter only when yoffset==0
-global sym(vp8_filter_block1d8_h6_only_sse2)
+global sym(vp8_filter_block1d8_h6_only_sse2) PRIVATE
 sym(vp8_filter_block1d8_h6_only_sse2):
     push        rbp
     mov         rbp, rsp
@@ -647,7 +647,7 @@
 ;    const short    *vp8_filter
 ;)
 ; First-pass filter only when yoffset==0
-global sym(vp8_filter_block1d16_h6_only_sse2)
+global sym(vp8_filter_block1d16_h6_only_sse2) PRIVATE
 sym(vp8_filter_block1d16_h6_only_sse2):
     push        rbp
     mov         rbp, rsp
@@ -812,7 +812,7 @@
 ;    const short    *vp8_filter
 ;)
 ; Second-pass filter only when xoffset==0
-global sym(vp8_filter_block1d8_v6_only_sse2)
+global sym(vp8_filter_block1d8_v6_only_sse2) PRIVATE
 sym(vp8_filter_block1d8_v6_only_sse2):
     push        rbp
     mov         rbp, rsp
@@ -904,7 +904,7 @@
 ;    unsigned int    output_height,
 ;    unsigned int    output_width
 ;)
-global sym(vp8_unpack_block1d16_h6_sse2)
+global sym(vp8_unpack_block1d16_h6_sse2) PRIVATE
 sym(vp8_unpack_block1d16_h6_sse2):
     push        rbp
     mov         rbp, rsp
@@ -963,7 +963,7 @@
 ;    int dst_pitch
 ;)
 extern sym(vp8_bilinear_filters_x86_8)
-global sym(vp8_bilinear_predict16x16_sse2)
+global sym(vp8_bilinear_predict16x16_sse2) PRIVATE
 sym(vp8_bilinear_predict16x16_sse2):
     push        rbp
     mov         rbp, rsp
@@ -1231,7 +1231,7 @@
 ;    unsigned char *dst_ptr,
 ;    int dst_pitch
 ;)
-global sym(vp8_bilinear_predict8x8_sse2)
+global sym(vp8_bilinear_predict8x8_sse2) PRIVATE
 sym(vp8_bilinear_predict8x8_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/vp8/common/x86/subpixel_ssse3.asm b/vp8/common/x86/subpixel_ssse3.asm
index 6bca82b..13bcaf6 100644
--- a/vp8/common/x86/subpixel_ssse3.asm
+++ b/vp8/common/x86/subpixel_ssse3.asm
@@ -34,7 +34,7 @@
 ;    unsigned int    output_height,
 ;    unsigned int    vp8_filter_index
 ;)
-global sym(vp8_filter_block1d8_h6_ssse3)
+global sym(vp8_filter_block1d8_h6_ssse3) PRIVATE
 sym(vp8_filter_block1d8_h6_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -177,7 +177,7 @@
 ;    unsigned int    output_height,
 ;    unsigned int    vp8_filter_index
 ;)
-global sym(vp8_filter_block1d16_h6_ssse3)
+global sym(vp8_filter_block1d16_h6_ssse3) PRIVATE
 sym(vp8_filter_block1d16_h6_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -284,7 +284,7 @@
 ;    unsigned int    output_height,
 ;    unsigned int    vp8_filter_index
 ;)
-global sym(vp8_filter_block1d4_h6_ssse3)
+global sym(vp8_filter_block1d4_h6_ssse3) PRIVATE
 sym(vp8_filter_block1d4_h6_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -413,7 +413,7 @@
 ;    unsigned int   output_height,
 ;    unsigned int   vp8_filter_index
 ;)
-global sym(vp8_filter_block1d16_v6_ssse3)
+global sym(vp8_filter_block1d16_v6_ssse3) PRIVATE
 sym(vp8_filter_block1d16_v6_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -601,7 +601,7 @@
 ;    unsigned int   output_height,
 ;    unsigned int   vp8_filter_index
 ;)
-global sym(vp8_filter_block1d8_v6_ssse3)
+global sym(vp8_filter_block1d8_v6_ssse3) PRIVATE
 sym(vp8_filter_block1d8_v6_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -741,7 +741,7 @@
 ;    unsigned int   output_height,
 ;    unsigned int   vp8_filter_index
 ;)
-global sym(vp8_filter_block1d4_v6_ssse3)
+global sym(vp8_filter_block1d4_v6_ssse3) PRIVATE
 sym(vp8_filter_block1d4_v6_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -880,7 +880,7 @@
 ;    unsigned char *dst_ptr,
 ;    int dst_pitch
 ;)
-global sym(vp8_bilinear_predict16x16_ssse3)
+global sym(vp8_bilinear_predict16x16_ssse3) PRIVATE
 sym(vp8_bilinear_predict16x16_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -1143,7 +1143,7 @@
 ;    unsigned char *dst_ptr,
 ;    int dst_pitch
 ;)
-global sym(vp8_bilinear_predict8x8_ssse3)
+global sym(vp8_bilinear_predict8x8_ssse3) PRIVATE
 sym(vp8_bilinear_predict8x8_ssse3):
     push        rbp
     mov         rbp, rsp
diff --git a/vp8/common/x86/variance_impl_mmx.asm b/vp8/common/x86/variance_impl_mmx.asm
index 2be8bbe..d9120d0 100644
--- a/vp8/common/x86/variance_impl_mmx.asm
+++ b/vp8/common/x86/variance_impl_mmx.asm
@@ -12,7 +12,7 @@
 %include "vpx_ports/x86_abi_support.asm"
 
 ;unsigned int vp8_get_mb_ss_mmx( short *src_ptr )
-global sym(vp8_get_mb_ss_mmx)
+global sym(vp8_get_mb_ss_mmx) PRIVATE
 sym(vp8_get_mb_ss_mmx):
     push        rbp
     mov         rbp, rsp
@@ -72,7 +72,7 @@
 ;    unsigned int *SSE,
 ;    int *Sum
 ;)
-global sym(vp8_get8x8var_mmx)
+global sym(vp8_get8x8var_mmx) PRIVATE
 sym(vp8_get8x8var_mmx):
     push        rbp
     mov         rbp, rsp
@@ -320,7 +320,7 @@
 ;    unsigned int *SSE,
 ;    int *Sum
 ;)
-global sym(vp8_get4x4var_mmx)
+global sym(vp8_get4x4var_mmx) PRIVATE
 sym(vp8_get4x4var_mmx):
     push        rbp
     mov         rbp, rsp
@@ -433,7 +433,7 @@
 ;    unsigned char *ref_ptr,
 ;    int  recon_stride
 ;)
-global sym(vp8_get4x4sse_cs_mmx)
+global sym(vp8_get4x4sse_cs_mmx) PRIVATE
 sym(vp8_get4x4sse_cs_mmx):
     push        rbp
     mov         rbp, rsp
@@ -522,7 +522,7 @@
 ;    int *sum,
 ;    unsigned int *sumsquared
 ;)
-global sym(vp8_filter_block2d_bil4x4_var_mmx)
+global sym(vp8_filter_block2d_bil4x4_var_mmx) PRIVATE
 sym(vp8_filter_block2d_bil4x4_var_mmx):
     push        rbp
     mov         rbp, rsp
@@ -667,7 +667,7 @@
 ;    int *sum,
 ;    unsigned int *sumsquared
 ;)
-global sym(vp8_filter_block2d_bil_var_mmx)
+global sym(vp8_filter_block2d_bil_var_mmx) PRIVATE
 sym(vp8_filter_block2d_bil_var_mmx):
     push        rbp
     mov         rbp, rsp
diff --git a/vp8/common/x86/variance_impl_sse2.asm b/vp8/common/x86/variance_impl_sse2.asm
index 7629220..761433c 100644
--- a/vp8/common/x86/variance_impl_sse2.asm
+++ b/vp8/common/x86/variance_impl_sse2.asm
@@ -17,7 +17,7 @@
 ;(
 ;    short *src_ptr
 ;)
-global sym(vp8_get_mb_ss_sse2)
+global sym(vp8_get_mb_ss_sse2) PRIVATE
 sym(vp8_get_mb_ss_sse2):
     push        rbp
     mov         rbp, rsp
@@ -80,7 +80,7 @@
 ;    unsigned int    *  SSE,
 ;    int             *  Sum
 ;)
-global sym(vp8_get16x16var_sse2)
+global sym(vp8_get16x16var_sse2) PRIVATE
 sym(vp8_get16x16var_sse2):
     push        rbp
     mov         rbp, rsp
@@ -224,7 +224,7 @@
 ;    unsigned int    *  SSE,
 ;    int             *  Sum
 ;)
-global sym(vp8_get8x8var_sse2)
+global sym(vp8_get8x8var_sse2) PRIVATE
 sym(vp8_get8x8var_sse2):
     push        rbp
     mov         rbp, rsp
@@ -413,7 +413,7 @@
 ;    unsigned int *sumsquared;;
 ;
 ;)
-global sym(vp8_filter_block2d_bil_var_sse2)
+global sym(vp8_filter_block2d_bil_var_sse2) PRIVATE
 sym(vp8_filter_block2d_bil_var_sse2):
     push        rbp
     mov         rbp, rsp
@@ -690,7 +690,7 @@
 ;    int *sum,
 ;    unsigned int *sumsquared
 ;)
-global sym(vp8_half_horiz_vert_variance8x_h_sse2)
+global sym(vp8_half_horiz_vert_variance8x_h_sse2) PRIVATE
 sym(vp8_half_horiz_vert_variance8x_h_sse2):
     push        rbp
     mov         rbp, rsp
@@ -812,7 +812,7 @@
 ;    int *sum,
 ;    unsigned int *sumsquared
 ;)
-global sym(vp8_half_horiz_vert_variance16x_h_sse2)
+global sym(vp8_half_horiz_vert_variance16x_h_sse2) PRIVATE
 sym(vp8_half_horiz_vert_variance16x_h_sse2):
     push        rbp
     mov         rbp, rsp
@@ -928,7 +928,7 @@
 ;    int *sum,
 ;    unsigned int *sumsquared
 ;)
-global sym(vp8_half_vert_variance8x_h_sse2)
+global sym(vp8_half_vert_variance8x_h_sse2) PRIVATE
 sym(vp8_half_vert_variance8x_h_sse2):
     push        rbp
     mov         rbp, rsp
@@ -1035,7 +1035,7 @@
 ;    int *sum,
 ;    unsigned int *sumsquared
 ;)
-global sym(vp8_half_vert_variance16x_h_sse2)
+global sym(vp8_half_vert_variance16x_h_sse2) PRIVATE
 sym(vp8_half_vert_variance16x_h_sse2):
     push        rbp
     mov         rbp, rsp
@@ -1143,7 +1143,7 @@
 ;    int *sum,
 ;    unsigned int *sumsquared
 ;)
-global sym(vp8_half_horiz_variance8x_h_sse2)
+global sym(vp8_half_horiz_variance8x_h_sse2) PRIVATE
 sym(vp8_half_horiz_variance8x_h_sse2):
     push        rbp
     mov         rbp, rsp
@@ -1248,7 +1248,7 @@
 ;    int *sum,
 ;    unsigned int *sumsquared
 ;)
-global sym(vp8_half_horiz_variance16x_h_sse2)
+global sym(vp8_half_horiz_variance16x_h_sse2) PRIVATE
 sym(vp8_half_horiz_variance16x_h_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/vp8/common/x86/variance_impl_ssse3.asm b/vp8/common/x86/variance_impl_ssse3.asm
index 97e8b0e..686b4a9 100644
--- a/vp8/common/x86/variance_impl_ssse3.asm
+++ b/vp8/common/x86/variance_impl_ssse3.asm
@@ -29,7 +29,7 @@
 ;)
 ;Note: The filter coefficient at offset=0 is 128. Since the second register
 ;for Pmaddubsw is signed bytes, we must calculate zero offset seperately.
-global sym(vp8_filter_block2d_bil_var_ssse3)
+global sym(vp8_filter_block2d_bil_var_ssse3) PRIVATE
 sym(vp8_filter_block2d_bil_var_ssse3):
     push        rbp
     mov         rbp, rsp
diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h
index 6165d04..a98fd50 100644
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -119,6 +119,16 @@
     int optimize;
     int q_index;
 
+#if CONFIG_TEMPORAL_DENOISING
+    MB_PREDICTION_MODE best_sse_inter_mode;
+    int_mv best_sse_mv;
+    MV_REFERENCE_FRAME best_reference_frame;
+    MV_REFERENCE_FRAME best_zeromv_reference_frame;
+    unsigned char need_to_clamp_best_mvs;
+#endif
+
+
+
     void (*short_fdct4x4)(short *input, short *output, int pitch);
     void (*short_fdct8x4)(short *input, short *output, int pitch);
     void (*short_walsh4x4)(short *input, short *output, int pitch);
diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c
index 09ed9dd..f392396 100644
--- a/vp8/encoder/denoising.c
+++ b/vp8/encoder/denoising.c
@@ -15,198 +15,319 @@
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_rtcd.h"
 
-static const unsigned int NOISE_MOTION_THRESHOLD = 20*20;
-static const unsigned int NOISE_DIFF2_THRESHOLD = 75;
+static const unsigned int NOISE_MOTION_THRESHOLD = 25 * 25;
 // SSE_DIFF_THRESHOLD is selected as ~95% confidence assuming var(noise) ~= 100.
-static const unsigned int SSE_DIFF_THRESHOLD = 16*16*20;
-static const unsigned int SSE_THRESHOLD = 16*16*40;
+static const unsigned int SSE_DIFF_THRESHOLD = 16 * 16 * 20;
+static const unsigned int SSE_THRESHOLD = 16 * 16 * 40;
 
-static uint8_t blend(uint8_t state, uint8_t sample, uint8_t factor_q8)
-{
-  return (uint8_t)(
-      (((uint16_t)factor_q8 * ((uint16_t)state) +  // Q8
-        (uint16_t)(256 - factor_q8) * ((uint16_t)sample)) + 128)  // Q8
-      >> 8);
-}
+// The filtering coefficients used for denoising are adjusted for static
+// blocks, or blocks with very small motion vectors. This is done through
+// the motion magnitude parameter.
+//
+// There are currently 2048 possible mappings from absolute difference to
+// filter coefficient depending on the motion magnitude. Each mapping is
+// in a LUT table. All these tables are statically allocated but they are only
+// filled on their first use.
+//
+// Each entry is a pair of 16b values, the coefficient and its complement
+// to 256. Each of these values should only be 8b but they are 16b wide to
+// avoid slow partial register manipulations.
+enum {num_motion_magnitude_adjustments = 2048};
 
-static unsigned int denoiser_motion_compensate(YV12_BUFFER_CONFIG* src,
-                                               YV12_BUFFER_CONFIG* dst,
-                                               MACROBLOCK* x,
-                                               unsigned int best_sse,
-                                               unsigned int zero_mv_sse,
-                                               int recon_yoffset,
-                                               int recon_uvoffset)
-{
-  MACROBLOCKD filter_xd = x->e_mbd;
-  int mv_col;
-  int mv_row;
-  int sse_diff = zero_mv_sse - best_sse;
-  // Compensate the running average.
-  filter_xd.pre.y_buffer = src->y_buffer + recon_yoffset;
-  filter_xd.pre.u_buffer = src->u_buffer + recon_uvoffset;
-  filter_xd.pre.v_buffer = src->v_buffer + recon_uvoffset;
-  // Write the compensated running average to the destination buffer.
-  filter_xd.dst.y_buffer = dst->y_buffer + recon_yoffset;
-  filter_xd.dst.u_buffer = dst->u_buffer + recon_uvoffset;
-  filter_xd.dst.v_buffer = dst->v_buffer + recon_uvoffset;
-  // Use the best MV for the compensation.
-  filter_xd.mode_info_context->mbmi.ref_frame = LAST_FRAME;
-  filter_xd.mode_info_context->mbmi.mode = filter_xd.best_sse_inter_mode;
-  filter_xd.mode_info_context->mbmi.mv = filter_xd.best_sse_mv;
-  filter_xd.mode_info_context->mbmi.need_to_clamp_mvs =
-      filter_xd.need_to_clamp_best_mvs;
-  mv_col = filter_xd.best_sse_mv.as_mv.col;
-  mv_row = filter_xd.best_sse_mv.as_mv.row;
-  if (filter_xd.mode_info_context->mbmi.mode <= B_PRED ||
-      (mv_row*mv_row + mv_col*mv_col <= NOISE_MOTION_THRESHOLD &&
-       sse_diff < SSE_DIFF_THRESHOLD))
-  {
-    // Handle intra blocks as referring to last frame with zero motion and
-    // let the absolute pixel difference affect the filter factor.
-    // Also consider small amount of motion as being random walk due to noise,
-    // if it doesn't mean that we get a much bigger error.
-    // Note that any changes to the mode info only affects the denoising.
-    filter_xd.mode_info_context->mbmi.ref_frame = LAST_FRAME;
-    filter_xd.mode_info_context->mbmi.mode = ZEROMV;
-    filter_xd.mode_info_context->mbmi.mv.as_int = 0;
-    x->e_mbd.best_sse_inter_mode = ZEROMV;
-    x->e_mbd.best_sse_mv.as_int = 0;
-    best_sse = zero_mv_sse;
-  }
-  if (!x->skip)
-  {
-    vp8_build_inter_predictors_mb(&filter_xd);
-  }
-  else
-  {
-    vp8_build_inter16x16_predictors_mb(&filter_xd,
-                                       filter_xd.dst.y_buffer,
-                                       filter_xd.dst.u_buffer,
-                                       filter_xd.dst.v_buffer,
-                                       filter_xd.dst.y_stride,
-                                       filter_xd.dst.uv_stride);
-  }
-  return best_sse;
-}
+static union coeff_pair filter_coeff_LUT[num_motion_magnitude_adjustments][256];
+static uint8_t filter_coeff_LUT_initialized[num_motion_magnitude_adjustments] =
+    { 0 };
 
-static void denoiser_filter(YV12_BUFFER_CONFIG* mc_running_avg,
-                            YV12_BUFFER_CONFIG* running_avg,
-                            MACROBLOCK* signal,
-                            unsigned int motion_magnitude2,
-                            int y_offset,
-                            int uv_offset)
+
+union coeff_pair *vp8_get_filter_coeff_LUT(unsigned int motion_magnitude)
 {
-  unsigned char* sig = signal->thismb;
-  int sig_stride = 16;
-  unsigned char* mc_running_avg_y = mc_running_avg->y_buffer + y_offset;
-  int mc_avg_y_stride = mc_running_avg->y_stride;
-  unsigned char* running_avg_y = running_avg->y_buffer + y_offset;
-  int avg_y_stride = running_avg->y_stride;
-  int r, c;
-  for (r = 0; r < 16; r++)
-  {
-    for (c = 0; c < 16; c++)
+    union coeff_pair *LUT;
+    unsigned int motion_magnitude_adjustment = motion_magnitude >> 3;
+
+    if (motion_magnitude_adjustment >= num_motion_magnitude_adjustments)
     {
-      int diff;
-      int absdiff = 0;
-      unsigned int filter_coefficient;
-      absdiff = sig[c] - mc_running_avg_y[c];
-      absdiff = absdiff > 0 ? absdiff : -absdiff;
-      assert(absdiff >= 0 && absdiff < 256);
-      filter_coefficient = (255 << 8) / (256 + ((absdiff * 330) >> 3));
-      // Allow some additional filtering of static blocks, or blocks with very
-      // small motion vectors.
-      filter_coefficient += filter_coefficient / (3 + (motion_magnitude2 >> 3));
-      filter_coefficient = filter_coefficient > 255 ? 255 : filter_coefficient;
-
-      running_avg_y[c] = blend(mc_running_avg_y[c], sig[c], filter_coefficient);
-      diff = sig[c] - running_avg_y[c];
-
-      if (diff * diff < NOISE_DIFF2_THRESHOLD)
-      {
-        // Replace with mean to suppress the noise.
-        sig[c] = running_avg_y[c];
-      }
-      else
-      {
-        // Replace the filter state with the signal since the change in this
-        // pixel isn't classified as noise.
-        running_avg_y[c] = sig[c];
-      }
+        motion_magnitude_adjustment = num_motion_magnitude_adjustments - 1;
     }
-    sig += sig_stride;
-    mc_running_avg_y += mc_avg_y_stride;
-    running_avg_y += avg_y_stride;
-  }
+
+    LUT = filter_coeff_LUT[motion_magnitude_adjustment];
+
+    if (!filter_coeff_LUT_initialized[motion_magnitude_adjustment])
+    {
+        int absdiff;
+
+        for (absdiff = 0; absdiff < 256; ++absdiff)
+        {
+            unsigned int filter_coefficient;
+            filter_coefficient = (255 << 8) / (256 + ((absdiff * 330) >> 3));
+            filter_coefficient += filter_coefficient /
+                                  (3 + motion_magnitude_adjustment);
+
+            if (filter_coefficient > 255)
+            {
+                filter_coefficient = 255;
+            }
+
+            LUT[absdiff].as_short[0] = filter_coefficient ;
+            LUT[absdiff].as_short[1] = 256 - filter_coefficient;
+        }
+
+        filter_coeff_LUT_initialized[motion_magnitude_adjustment] = 1;
+    }
+
+    return LUT;
 }
 
+
+
+int vp8_denoiser_filter_c(YV12_BUFFER_CONFIG *mc_running_avg,
+                          YV12_BUFFER_CONFIG *running_avg,
+                          MACROBLOCK *signal,
+                          unsigned int motion_magnitude,
+                          int y_offset,
+                          int uv_offset)
+{
+    unsigned char filtered_buf[16*16];
+    unsigned char *filtered = filtered_buf;
+    unsigned char *sig = signal->thismb;
+    int sig_stride = 16;
+    unsigned char *mc_running_avg_y = mc_running_avg->y_buffer + y_offset;
+    int mc_avg_y_stride = mc_running_avg->y_stride;
+    unsigned char *running_avg_y = running_avg->y_buffer + y_offset;
+    int avg_y_stride = running_avg->y_stride;
+    const union coeff_pair *LUT = vp8_get_filter_coeff_LUT(motion_magnitude);
+    int r, c;
+    int sum_diff = 0;
+
+    for (r = 0; r < 16; ++r)
+    {
+        // Calculate absolute differences
+        unsigned char abs_diff[16];
+
+        union coeff_pair filter_coefficient[16];
+
+        for (c = 0; c < 16; ++c)
+        {
+            int absdiff = sig[c] - mc_running_avg_y[c];
+            absdiff = absdiff > 0 ? absdiff : -absdiff;
+            abs_diff[c] = absdiff;
+        }
+
+        // Use LUT to get filter coefficients (two 16b values; f and 256-f)
+        for (c = 0; c < 16; ++c)
+        {
+            filter_coefficient[c] = LUT[abs_diff[c]];
+        }
+
+        // Filtering...
+        for (c = 0; c < 16; ++c)
+        {
+            const uint16_t state = (uint16_t)(mc_running_avg_y[c]);
+            const uint16_t sample = (uint16_t)(sig[c]);
+
+            running_avg_y[c] = (filter_coefficient[c].as_short[0] * state +
+                    filter_coefficient[c].as_short[1] * sample + 128) >> 8;
+        }
+
+        // Depending on the magnitude of the difference between the signal and
+        // filtered version, either replace the signal by the filtered one or
+        // update the filter state with the signal when the change in a pixel
+        // isn't classified as noise.
+        for (c = 0; c < 16; ++c)
+        {
+            const int diff = sig[c] - running_avg_y[c];
+            sum_diff += diff;
+
+            if (diff * diff < NOISE_DIFF2_THRESHOLD)
+            {
+                filtered[c] = running_avg_y[c];
+            }
+            else
+            {
+                filtered[c] = sig[c];
+                running_avg_y[c] = sig[c];
+            }
+        }
+
+        // Update pointers for next iteration.
+        sig += sig_stride;
+        filtered += 16;
+        mc_running_avg_y += mc_avg_y_stride;
+        running_avg_y += avg_y_stride;
+    }
+    if (abs(sum_diff) > SUM_DIFF_THRESHOLD)
+    {
+        return COPY_BLOCK;
+    }
+    vp8_copy_mem16x16(filtered_buf, 16, signal->thismb, sig_stride);
+    return FILTER_BLOCK;
+}
+
+
 int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height)
 {
-  assert(denoiser);
-  denoiser->yv12_running_avg.flags = 0;
-  if (vp8_yv12_alloc_frame_buffer(&(denoiser->yv12_running_avg), width,
-                                  height, VP8BORDERINPIXELS) < 0)
-  {
-      vp8_denoiser_free(denoiser);
-      return 1;
-  }
-  denoiser->yv12_mc_running_avg.flags = 0;
-  if (vp8_yv12_alloc_frame_buffer(&(denoiser->yv12_mc_running_avg), width,
-                                  height, VP8BORDERINPIXELS) < 0)
-  {
-      vp8_denoiser_free(denoiser);
-      return 1;
-  }
-  vpx_memset(denoiser->yv12_running_avg.buffer_alloc, 0,
-             denoiser->yv12_running_avg.frame_size);
-  vpx_memset(denoiser->yv12_mc_running_avg.buffer_alloc, 0,
-             denoiser->yv12_mc_running_avg.frame_size);
-  return 0;
+    int i;
+    assert(denoiser);
+
+    /* we don't need one for the intra frame, so start at 1 */
+    for (i = 1; i < MAX_REF_FRAMES; i++)
+    {
+        denoiser->yv12_running_avg[i].flags = 0;
+
+        if (vp8_yv12_alloc_frame_buffer(&(denoiser->yv12_running_avg[i]), width,
+                                        height, VP8BORDERINPIXELS)
+            < 0)
+        {
+            vp8_denoiser_free(denoiser);
+            return 1;
+        }
+        vpx_memset(denoiser->yv12_running_avg[i].buffer_alloc, 0,
+                   denoiser->yv12_running_avg[i].frame_size);
+
+    }
+    denoiser->yv12_mc_running_avg.flags = 0;
+
+    if (vp8_yv12_alloc_frame_buffer(&(denoiser->yv12_mc_running_avg), width,
+                                   height, VP8BORDERINPIXELS) < 0)
+    {
+        vp8_denoiser_free(denoiser);
+        return 1;
+    }
+
+    vpx_memset(denoiser->yv12_mc_running_avg.buffer_alloc, 0,
+               denoiser->yv12_mc_running_avg.frame_size);
+    return 0;
 }
 
 void vp8_denoiser_free(VP8_DENOISER *denoiser)
 {
-  assert(denoiser);
-  vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_running_avg);
-  vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_mc_running_avg);
+    int i;
+    assert(denoiser);
+
+    /* we don't have one for intra ref frame */
+    for (i = 1; i < MAX_REF_FRAMES ; i++)
+    {
+        vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_running_avg[i]);
+    }
+    vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_mc_running_avg);
 }
 
+
 void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
                              MACROBLOCK *x,
                              unsigned int best_sse,
                              unsigned int zero_mv_sse,
                              int recon_yoffset,
-                             int recon_uvoffset) {
-  int mv_row;
-  int mv_col;
-  unsigned int motion_magnitude2;
-  // Motion compensate the running average.
-  best_sse = denoiser_motion_compensate(&denoiser->yv12_running_avg,
-                                        &denoiser->yv12_mc_running_avg,
-                                        x,
-                                        best_sse,
-                                        zero_mv_sse,
-                                        recon_yoffset,
-                                        recon_uvoffset);
+                             int recon_uvoffset)
+{
+    int mv_row;
+    int mv_col;
+    unsigned int motion_magnitude2;
 
-  mv_row = x->e_mbd.best_sse_mv.as_mv.row;
-  mv_col = x->e_mbd.best_sse_mv.as_mv.col;
-  motion_magnitude2 = mv_row*mv_row + mv_col*mv_col;
-  if (best_sse > SSE_THRESHOLD ||
-      motion_magnitude2 > 8 * NOISE_MOTION_THRESHOLD)
-  {
-    // No filtering of this block since it differs too much from the predictor,
-    // or the motion vector magnitude is considered too big.
-    vp8_copy_mem16x16(x->thismb, 16,
-                      denoiser->yv12_running_avg.y_buffer + recon_yoffset,
-                      denoiser->yv12_running_avg.y_stride);
-    return;
-  }
-  // Filter.
-  denoiser_filter(&denoiser->yv12_mc_running_avg,
-                  &denoiser->yv12_running_avg,
-                  x,
-                  motion_magnitude2,
-                  recon_yoffset,
-                  recon_uvoffset);
+    MV_REFERENCE_FRAME frame = x->best_reference_frame;
+    MV_REFERENCE_FRAME zero_frame = x->best_zeromv_reference_frame;
+
+    enum vp8_denoiser_decision decision = FILTER_BLOCK;
+
+    // Motion compensate the running average.
+    if (zero_frame)
+    {
+        YV12_BUFFER_CONFIG *src = &denoiser->yv12_running_avg[frame];
+        YV12_BUFFER_CONFIG *dst = &denoiser->yv12_mc_running_avg;
+        YV12_BUFFER_CONFIG saved_pre,saved_dst;
+        MB_MODE_INFO saved_mbmi;
+        MACROBLOCKD *filter_xd = &x->e_mbd;
+        MB_MODE_INFO *mbmi = &filter_xd->mode_info_context->mbmi;
+        int mv_col;
+        int mv_row;
+        int sse_diff = zero_mv_sse - best_sse;
+
+        saved_mbmi = *mbmi;
+
+        // Use the best MV for the compensation.
+        mbmi->ref_frame = x->best_reference_frame;
+        mbmi->mode = x->best_sse_inter_mode;
+        mbmi->mv = x->best_sse_mv;
+        mbmi->need_to_clamp_mvs = x->need_to_clamp_best_mvs;
+        mv_col = x->best_sse_mv.as_mv.col;
+        mv_row = x->best_sse_mv.as_mv.row;
+
+        if (frame == INTRA_FRAME ||
+            (mv_row *mv_row + mv_col *mv_col <= NOISE_MOTION_THRESHOLD &&
+             sse_diff < SSE_DIFF_THRESHOLD))
+        {
+            // Handle intra blocks as referring to last frame with zero motion
+            // and let the absolute pixel difference affect the filter factor.
+            // Also consider small amount of motion as being random walk due to
+            // noise, if it doesn't mean that we get a much bigger error.
+            // Note that any changes to the mode info only affects the denoising.
+            mbmi->ref_frame =
+                    x->best_zeromv_reference_frame;
+
+            src = &denoiser->yv12_running_avg[zero_frame];
+
+            mbmi->mode = ZEROMV;
+            mbmi->mv.as_int = 0;
+            x->best_sse_inter_mode = ZEROMV;
+            x->best_sse_mv.as_int = 0;
+            best_sse = zero_mv_sse;
+        }
+
+        saved_pre = filter_xd->pre;
+        saved_dst = filter_xd->dst;
+
+        // Compensate the running average.
+        filter_xd->pre.y_buffer = src->y_buffer + recon_yoffset;
+        filter_xd->pre.u_buffer = src->u_buffer + recon_uvoffset;
+        filter_xd->pre.v_buffer = src->v_buffer + recon_uvoffset;
+        // Write the compensated running average to the destination buffer.
+        filter_xd->dst.y_buffer = dst->y_buffer + recon_yoffset;
+        filter_xd->dst.u_buffer = dst->u_buffer + recon_uvoffset;
+        filter_xd->dst.v_buffer = dst->v_buffer + recon_uvoffset;
+
+        if (!x->skip)
+        {
+            vp8_build_inter_predictors_mb(filter_xd);
+        }
+        else
+        {
+            vp8_build_inter16x16_predictors_mb(filter_xd,
+                                               filter_xd->dst.y_buffer,
+                                               filter_xd->dst.u_buffer,
+                                               filter_xd->dst.v_buffer,
+                                               filter_xd->dst.y_stride,
+                                               filter_xd->dst.uv_stride);
+        }
+        filter_xd->pre = saved_pre;
+        filter_xd->dst = saved_dst;
+        *mbmi = saved_mbmi;
+
+    }
+
+    mv_row = x->best_sse_mv.as_mv.row;
+    mv_col = x->best_sse_mv.as_mv.col;
+    motion_magnitude2 = mv_row * mv_row + mv_col * mv_col;
+    if (best_sse > SSE_THRESHOLD || motion_magnitude2
+           > 8 * NOISE_MOTION_THRESHOLD)
+    {
+        decision = COPY_BLOCK;
+    }
+
+    if (decision == FILTER_BLOCK)
+    {
+        // Filter.
+        decision = vp8_denoiser_filter(&denoiser->yv12_mc_running_avg,
+                                       &denoiser->yv12_running_avg[LAST_FRAME],
+                                       x,
+                                       motion_magnitude2,
+                                       recon_yoffset, recon_uvoffset);
+    }
+    if (decision == COPY_BLOCK)
+    {
+        // No filtering of this block; it differs too much from the predictor,
+        // or the motion vector magnitude is considered too big.
+        vp8_copy_mem16x16(
+                x->thismb, 16,
+                denoiser->yv12_running_avg[LAST_FRAME].y_buffer + recon_yoffset,
+                denoiser->yv12_running_avg[LAST_FRAME].y_stride);
+    }
 }
diff --git a/vp8/encoder/denoising.h b/vp8/encoder/denoising.h
index 343531b..dc78e65 100644
--- a/vp8/encoder/denoising.h
+++ b/vp8/encoder/denoising.h
@@ -13,10 +13,19 @@
 
 #include "block.h"
 
+#define NOISE_DIFF2_THRESHOLD (75)
+#define SUM_DIFF_THRESHOLD (16 * 16 * 2)
+
+enum vp8_denoiser_decision
+{
+  COPY_BLOCK,
+  FILTER_BLOCK,
+};
+
 typedef struct vp8_denoiser
 {
-  YV12_BUFFER_CONFIG yv12_running_avg;
-  YV12_BUFFER_CONFIG yv12_mc_running_avg;
+    YV12_BUFFER_CONFIG yv12_running_avg[MAX_REF_FRAMES];
+    YV12_BUFFER_CONFIG yv12_mc_running_avg;
 } VP8_DENOISER;
 
 int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height);
@@ -30,4 +39,12 @@
                              int recon_yoffset,
                              int recon_uvoffset);
 
+union coeff_pair
+{
+    uint32_t as_int;
+    uint16_t as_short[2];
+};
+
+union coeff_pair *vp8_get_filter_coeff_LUT(unsigned int motion_magnitude);
+
 #endif  // VP8_ENCODER_DENOISING_H_
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index 8233873..4450ab2 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -1177,9 +1177,11 @@
 
 #if CONFIG_TEMPORAL_DENOISING
     // Reset the best sse mode/mv for each macroblock.
-    x->e_mbd.best_sse_inter_mode = 0;
-    x->e_mbd.best_sse_mv.as_int = 0;
-    x->e_mbd.need_to_clamp_best_mvs = 0;
+    x->best_reference_frame = INTRA_FRAME;
+    x->best_zeromv_reference_frame = INTRA_FRAME;
+    x->best_sse_inter_mode = 0;
+    x->best_sse_mv.as_int = 0;
+    x->need_to_clamp_best_mvs = 0;
 #endif
 
     if (cpi->sf.RD)
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 0f4bc1d..878cad4 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -3156,9 +3156,49 @@
 #if CONFIG_TEMPORAL_DENOISING
     if (cpi->oxcf.noise_sensitivity)
     {
-      vp8_yv12_extend_frame_borders(&cpi->denoiser.yv12_running_avg);
+
+
+        /* we shouldn't have to keep multiple copies as we know in advance which
+         * buffer we should start from - for now, to get something up and running
+         * I've chosen to copy the buffers
+         */
+        if (cm->frame_type == KEY_FRAME)
+        {
+            int i;
+            vp8_yv12_copy_frame(
+                    cpi->Source,
+                    &cpi->denoiser.yv12_running_avg[LAST_FRAME]);
+
+            vp8_yv12_extend_frame_borders(
+                    &cpi->denoiser.yv12_running_avg[LAST_FRAME]);
+
+            for (i = 2; i < MAX_REF_FRAMES - 1; i++)
+                vp8_yv12_copy_frame(
+                        cpi->Source,
+                        &cpi->denoiser.yv12_running_avg[i]);
+        }
+        else /* For non key frames */
+        {
+            vp8_yv12_extend_frame_borders(
+                    &cpi->denoiser.yv12_running_avg[LAST_FRAME]);
+
+            if (cm->refresh_alt_ref_frame || cm->copy_buffer_to_arf)
+            {
+                vp8_yv12_copy_frame(
+                        &cpi->denoiser.yv12_running_avg[LAST_FRAME],
+                        &cpi->denoiser.yv12_running_avg[ALTREF_FRAME]);
+            }
+            if (cm->refresh_golden_frame || cm->copy_buffer_to_gf)
+            {
+                vp8_yv12_copy_frame(
+                        &cpi->denoiser.yv12_running_avg[LAST_FRAME],
+                        &cpi->denoiser.yv12_running_avg[GOLDEN_FRAME]);
+            }
+        }
+
     }
 #endif
+
 }
 
 static void encode_frame_to_data_rate
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index dafb645..7f81713 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -61,7 +61,7 @@
 }
 
 
-static int get_inter_mbpred_error(MACROBLOCK *mb,
+int vp8_get_inter_mbpred_error(MACROBLOCK *mb,
                                   const vp8_variance_fn_ptr_t *vfp,
                                   unsigned int *sse,
                                   int_mv this_mv)
@@ -486,7 +486,7 @@
 
     if((this_mode != NEWMV) ||
         !(cpi->sf.half_pixel_search) || cpi->common.full_pixel==1)
-        *distortion2 = get_inter_mbpred_error(x,
+        *distortion2 = vp8_get_inter_mbpred_error(x,
                                               &cpi->fn_ptr[BLOCK_16X16],
                                               sse, mv);
 
@@ -523,7 +523,7 @@
     int best_mode_index = 0;
     unsigned int sse = INT_MAX, best_rd_sse = INT_MAX;
 #if CONFIG_TEMPORAL_DENOISING
-    unsigned int zero_mv_sse = 0, best_sse = INT_MAX;
+    unsigned int zero_mv_sse = INT_MAX, best_sse = INT_MAX;
 #endif
 
     int_mv mvp;
@@ -964,25 +964,27 @@
 #if CONFIG_TEMPORAL_DENOISING
         if (cpi->oxcf.noise_sensitivity)
         {
-          // Store for later use by denoiser.
-          if (this_mode == ZEROMV &&
-              x->e_mbd.mode_info_context->mbmi.ref_frame == LAST_FRAME)
-          {
-            zero_mv_sse = sse;
-          }
 
-          // Store the best NEWMV in x for later use in the denoiser.
-          // We are restricted to the LAST_FRAME since the denoiser only keeps
-          // one filter state.
-          if (x->e_mbd.mode_info_context->mbmi.mode == NEWMV &&
-              x->e_mbd.mode_info_context->mbmi.ref_frame == LAST_FRAME)
-          {
-            best_sse = sse;
-            x->e_mbd.best_sse_inter_mode = NEWMV;
-            x->e_mbd.best_sse_mv = x->e_mbd.mode_info_context->mbmi.mv;
-            x->e_mbd.need_to_clamp_best_mvs =
-                x->e_mbd.mode_info_context->mbmi.need_to_clamp_mvs;
-          }
+            // Store for later use by denoiser.
+            if (this_mode == ZEROMV && sse < zero_mv_sse )
+            {
+                zero_mv_sse = sse;
+                x->best_zeromv_reference_frame =
+                        x->e_mbd.mode_info_context->mbmi.ref_frame;
+            }
+
+            // Store the best NEWMV in x for later use in the denoiser.
+            if (x->e_mbd.mode_info_context->mbmi.mode == NEWMV &&
+                    sse < best_sse)
+            {
+                best_sse = sse;
+                x->best_sse_inter_mode = NEWMV;
+                x->best_sse_mv = x->e_mbd.mode_info_context->mbmi.mv;
+                x->need_to_clamp_best_mvs =
+                    x->e_mbd.mode_info_context->mbmi.need_to_clamp_mvs;
+                x->best_reference_frame =
+                    x->e_mbd.mode_info_context->mbmi.ref_frame;
+            }
         }
 #endif
 
@@ -1058,37 +1060,47 @@
 #if CONFIG_TEMPORAL_DENOISING
     if (cpi->oxcf.noise_sensitivity)
     {
-      if (x->e_mbd.best_sse_inter_mode == DC_PRED) {
-        // No best MV found.
-        x->e_mbd.best_sse_inter_mode = best_mbmode.mode;
-        x->e_mbd.best_sse_mv = best_mbmode.mv;
-        x->e_mbd.need_to_clamp_best_mvs = best_mbmode.need_to_clamp_mvs;
-        best_sse = best_rd_sse;
-      }
-      vp8_denoiser_denoise_mb(&cpi->denoiser, x, best_sse, zero_mv_sse,
-                              recon_yoffset, recon_uvoffset);
-
-      // Reevaluate ZEROMV after denoising.
-      if (best_mbmode.ref_frame == INTRA_FRAME)
-      {
-        int this_rd = 0;
-        rate2 = 0;
-        distortion2 = 0;
-        x->e_mbd.mode_info_context->mbmi.ref_frame = LAST_FRAME;
-        rate2 += x->ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame];
-        this_mode = ZEROMV;
-        rate2 += vp8_cost_mv_ref(this_mode, mdcounts);
-        x->e_mbd.mode_info_context->mbmi.mode = this_mode;
-        x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
-        x->e_mbd.mode_info_context->mbmi.mv.as_int = 0;
-        this_rd = evaluate_inter_mode(&sse, rate2, &distortion2, cpi, x);
-
-        if (this_rd < best_rd || x->skip)
+        if (x->best_sse_inter_mode == DC_PRED)
         {
-            vpx_memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi,
-                       sizeof(MB_MODE_INFO));
+            // No best MV found.
+            x->best_sse_inter_mode = best_mbmode.mode;
+            x->best_sse_mv = best_mbmode.mv;
+            x->need_to_clamp_best_mvs = best_mbmode.need_to_clamp_mvs;
+            x->best_reference_frame = best_mbmode.ref_frame;
+            best_sse = best_rd_sse;
         }
-      }
+        vp8_denoiser_denoise_mb(&cpi->denoiser, x, best_sse, zero_mv_sse,
+                                recon_yoffset, recon_uvoffset);
+
+
+        // Reevaluate ZEROMV after denoising.
+        if (best_mbmode.ref_frame == INTRA_FRAME &&
+            x->best_zeromv_reference_frame != INTRA_FRAME)
+        {
+            int this_rd = 0;
+            int this_ref_frame = x->best_zeromv_reference_frame;
+            rate2 = x->ref_frame_cost[this_ref_frame] +
+                    vp8_cost_mv_ref(ZEROMV, mdcounts);
+            distortion2 = 0;
+
+            // set up the proper prediction buffers for the frame
+            x->e_mbd.mode_info_context->mbmi.ref_frame = this_ref_frame;
+            x->e_mbd.pre.y_buffer = plane[this_ref_frame][0];
+            x->e_mbd.pre.u_buffer = plane[this_ref_frame][1];
+            x->e_mbd.pre.v_buffer = plane[this_ref_frame][2];
+
+            x->e_mbd.mode_info_context->mbmi.mode = ZEROMV;
+            x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
+            x->e_mbd.mode_info_context->mbmi.mv.as_int = 0;
+            this_rd = evaluate_inter_mode(&sse, rate2, &distortion2, cpi, x);
+
+            if (this_rd < best_rd)
+            {
+                vpx_memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi,
+                           sizeof(MB_MODE_INFO));
+            }
+        }
+
     }
 #endif
 
diff --git a/vp8/encoder/pickinter.h b/vp8/encoder/pickinter.h
index 3d83782..6fbd887 100644
--- a/vp8/encoder/pickinter.h
+++ b/vp8/encoder/pickinter.h
@@ -20,4 +20,8 @@
                                 int mb_row, int mb_col);
 extern void vp8_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate);
 
+extern int vp8_get_inter_mbpred_error(MACROBLOCK *mb,
+                                      const vp8_variance_fn_ptr_t *vfp,
+                                      unsigned int *sse,
+                                      int_mv this_mv);
 #endif
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 2b706ba..27956b1 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -21,6 +21,7 @@
 #include "onyx_int.h"
 #include "modecosts.h"
 #include "encodeintra.h"
+#include "pickinter.h"
 #include "vp8/common/entropymode.h"
 #include "vp8/common/reconinter.h"
 #include "vp8/common/reconintra4x4.h"
@@ -36,7 +37,6 @@
 #if CONFIG_TEMPORAL_DENOISING
 #include "denoising.h"
 #endif
-
 extern void vp8_update_zbin_extra(VP8_COMP *cpi, MACROBLOCK *x);
 
 #define MAXF(a,b)            (((a) > (b)) ? (a) : (b))
@@ -1962,6 +1962,11 @@
     int intra_rd_penalty =  10* vp8_dc_quant(cpi->common.base_qindex,
                                              cpi->common.y1dc_delta_q);
 
+#if CONFIG_TEMPORAL_DENOISING
+    unsigned int zero_mv_sse = INT_MAX, best_sse = INT_MAX,
+            best_rd_sse = INT_MAX;
+#endif
+
     mode_mv = mode_mv_sb[sign_bias];
     best_ref_mv.as_int = 0;
     best_mode.rd = INT_MAX;
@@ -2372,21 +2377,38 @@
           best_mode.intra_rd = this_rd;
             *returnintra = rd.distortion2 ;
         }
-
 #if CONFIG_TEMPORAL_DENOISING
         if (cpi->oxcf.noise_sensitivity)
         {
-          // Store the best NEWMV in x for later use in the denoiser.
-          // We are restricted to the LAST_FRAME since the denoiser only keeps
-          // one filter state.
-          if (x->e_mbd.mode_info_context->mbmi.mode == NEWMV &&
-              x->e_mbd.mode_info_context->mbmi.ref_frame == LAST_FRAME)
-          {
-            x->e_mbd.best_sse_inter_mode = NEWMV;
-            x->e_mbd.best_sse_mv = x->e_mbd.mode_info_context->mbmi.mv;
-            x->e_mbd.need_to_clamp_best_mvs =
-                x->e_mbd.mode_info_context->mbmi.need_to_clamp_mvs;
-          }
+            unsigned int sse;
+            vp8_get_inter_mbpred_error(x, &cpi->fn_ptr[BLOCK_16X16], &sse,
+                                       mode_mv[this_mode]);
+
+            if (sse < best_rd_sse)
+                best_rd_sse = sse;
+
+            // Store for later use by denoiser.
+            if (this_mode == ZEROMV && sse < zero_mv_sse)
+            {
+                zero_mv_sse = sse;
+                x->best_zeromv_reference_frame =
+                        x->e_mbd.mode_info_context->mbmi.ref_frame;
+            }
+
+            // Store the best NEWMV in x for later use in the denoiser.
+            if (x->e_mbd.mode_info_context->mbmi.mode == NEWMV &&
+                    sse < best_sse)
+            {
+                best_sse = sse;
+                // Note: best_sse already holds this mode's prediction error;
+                // sse was computed by vp8_get_inter_mbpred_error() above.
+                x->best_sse_inter_mode = NEWMV;
+                x->best_sse_mv = x->e_mbd.mode_info_context->mbmi.mv;
+                x->need_to_clamp_best_mvs =
+                    x->e_mbd.mode_info_context->mbmi.need_to_clamp_mvs;
+                x->best_reference_frame =
+                    x->e_mbd.mode_info_context->mbmi.ref_frame;
+            }
         }
 #endif
 
@@ -2459,42 +2481,55 @@
 #if CONFIG_TEMPORAL_DENOISING
     if (cpi->oxcf.noise_sensitivity)
     {
-      if (x->e_mbd.best_sse_inter_mode == DC_PRED) {
-        // No best MV found.
-        x->e_mbd.best_sse_inter_mode = best_mode.mbmode.mode;
-        x->e_mbd.best_sse_mv = best_mode.mbmode.mv;
-        x->e_mbd.need_to_clamp_best_mvs = best_mode.mbmode.need_to_clamp_mvs;
-      }
-
-      // TODO(holmer): No SSEs are calculated in rdopt.c. What else can be used?
-      vp8_denoiser_denoise_mb(&cpi->denoiser, x, 0, 0,
-                              recon_yoffset, recon_uvoffset);
-      // Reevalute ZEROMV if the current mode is INTRA.
-      if (best_mode.mbmode.ref_frame == INTRA_FRAME)
-      {
-        int this_rd = INT_MAX;
-        int disable_skip = 0;
-        int other_cost = 0;
-        vpx_memset(&rd, 0, sizeof(rd));
-        x->e_mbd.mode_info_context->mbmi.ref_frame = LAST_FRAME;
-        rd.rate2 += x->ref_frame_cost[LAST_FRAME];
-        rd.rate2 += vp8_cost_mv_ref(ZEROMV, mdcounts);
-        x->e_mbd.mode_info_context->mbmi.mode = ZEROMV;
-        x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
-        x->e_mbd.mode_info_context->mbmi.mv.as_int = 0;
-        this_rd = evaluate_inter_mode_rd(mdcounts, &rd, &disable_skip, cpi, x);
-        this_rd = calculate_final_rd_costs(this_rd, &rd, &other_cost,
-                                           disable_skip, uv_intra_tteob,
-                                           intra_rd_penalty, cpi, x);
-        if (this_rd < best_mode.rd || x->skip)
+        if (x->best_sse_inter_mode == DC_PRED)
         {
-            // Note index of best mode so far
-            best_mode_index = mode_index;
-            *returnrate = rd.rate2;
-            *returndistortion = rd.distortion2;
-            update_best_mode(&best_mode, this_rd, &rd, other_cost, x);
+            // No best MV found.
+            x->best_sse_inter_mode = best_mode.mbmode.mode;
+            x->best_sse_mv = best_mode.mbmode.mv;
+            x->need_to_clamp_best_mvs = best_mode.mbmode.need_to_clamp_mvs;
+            x->best_reference_frame = best_mode.mbmode.ref_frame;
+            best_sse = best_rd_sse;
         }
-      }
+        vp8_denoiser_denoise_mb(&cpi->denoiser, x, best_sse, zero_mv_sse,
+                                recon_yoffset, recon_uvoffset);
+
+
+        // Reevaluate ZEROMV after denoising.
+        if (best_mode.mbmode.ref_frame == INTRA_FRAME &&
+            x->best_zeromv_reference_frame != INTRA_FRAME)
+        {
+            int this_rd = INT_MAX;
+            int disable_skip = 0;
+            int other_cost = 0;
+            int this_ref_frame = x->best_zeromv_reference_frame;
+            rd.rate2 = x->ref_frame_cost[this_ref_frame] +
+                    vp8_cost_mv_ref(ZEROMV, mdcounts);
+            rd.distortion2 = 0;
+
+            // set up the proper prediction buffers for the frame
+            x->e_mbd.mode_info_context->mbmi.ref_frame = this_ref_frame;
+            x->e_mbd.pre.y_buffer = plane[this_ref_frame][0];
+            x->e_mbd.pre.u_buffer = plane[this_ref_frame][1];
+            x->e_mbd.pre.v_buffer = plane[this_ref_frame][2];
+
+            x->e_mbd.mode_info_context->mbmi.mode = ZEROMV;
+            x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
+            x->e_mbd.mode_info_context->mbmi.mv.as_int = 0;
+
+            this_rd = evaluate_inter_mode_rd(mdcounts, &rd, &disable_skip, cpi, x);
+            this_rd = calculate_final_rd_costs(this_rd, &rd, &other_cost,
+                                               disable_skip, uv_intra_tteob,
+                                               intra_rd_penalty, cpi, x);
+            if (this_rd < best_mode.rd || x->skip)
+            {
+                // Note index of best mode so far
+                best_mode_index = mode_index;
+                *returnrate = rd.rate2;
+                *returndistortion = rd.distortion2;
+                update_best_mode(&best_mode, this_rd, &rd, other_cost, x);
+            }
+        }
+
     }
 #endif
 
diff --git a/vp8/encoder/x86/dct_mmx.asm b/vp8/encoder/x86/dct_mmx.asm
index f07b030..6f188cb 100644
--- a/vp8/encoder/x86/dct_mmx.asm
+++ b/vp8/encoder/x86/dct_mmx.asm
@@ -12,7 +12,7 @@
 %include "vpx_ports/x86_abi_support.asm"
 
 ;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
-global sym(vp8_short_fdct4x4_mmx)
+global sym(vp8_short_fdct4x4_mmx) PRIVATE
 sym(vp8_short_fdct4x4_mmx):
     push        rbp
     mov         rbp,        rsp
diff --git a/vp8/encoder/x86/dct_sse2.asm b/vp8/encoder/x86/dct_sse2.asm
index 3d52a5d..d880ce0 100644
--- a/vp8/encoder/x86/dct_sse2.asm
+++ b/vp8/encoder/x86/dct_sse2.asm
@@ -61,7 +61,7 @@
 %endmacro
 
 ;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
-global sym(vp8_short_fdct4x4_sse2)
+global sym(vp8_short_fdct4x4_sse2) PRIVATE
 sym(vp8_short_fdct4x4_sse2):
 
     STACK_FRAME_CREATE
@@ -166,7 +166,7 @@
     STACK_FRAME_DESTROY
 
 ;void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
-global sym(vp8_short_fdct8x4_sse2)
+global sym(vp8_short_fdct8x4_sse2) PRIVATE
 sym(vp8_short_fdct8x4_sse2):
 
     STACK_FRAME_CREATE
diff --git a/vp8/encoder/x86/denoising_sse2.c b/vp8/encoder/x86/denoising_sse2.c
new file mode 100644
index 0000000..41991c2
--- /dev/null
+++ b/vp8/encoder/x86/denoising_sse2.c
@@ -0,0 +1,153 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp8/encoder/denoising.h"
+
+#include "vp8/common/reconinter.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_rtcd.h"
+
+#include <emmintrin.h>
+
+union sum_union {
+    __m128i v;
+    short e[8];
+};
+
+int vp8_denoiser_filter_sse2(YV12_BUFFER_CONFIG *mc_running_avg,
+                             YV12_BUFFER_CONFIG *running_avg,
+                             MACROBLOCK *signal, unsigned int motion_magnitude,
+                             int y_offset, int uv_offset)
+{
+    unsigned char filtered_buf[16*16];
+    unsigned char *filtered = filtered_buf;
+    unsigned char *sig = signal->thismb;
+    int sig_stride = 16;
+    unsigned char *mc_running_avg_y = mc_running_avg->y_buffer + y_offset;
+    int mc_avg_y_stride = mc_running_avg->y_stride;
+    unsigned char *running_avg_y = running_avg->y_buffer + y_offset;
+    int avg_y_stride = running_avg->y_stride;
+    const union coeff_pair *LUT = vp8_get_filter_coeff_LUT(motion_magnitude);
+    int r, c;
+    __m128i acc_diff = { 0 };
+
+    for (r = 0; r < 16; ++r)
+    {
+        __m128i filter_coefficient_00, filter_coefficient_04;
+        __m128i filter_coefficient_08, filter_coefficient_12;
+        __m128i v_sig0, v_sig1;
+        __m128i v_mc_running_avg_y0, v_mc_running_avg_y1;
+        __m128i state0, state1, state2, state3;
+        __m128i res0, res1, res2, res3;
+        __m128i v_running_avg_y;
+        __m128i diff0, diff1, diff0sq, diff1sq, diff_sq;
+        const __m128i kNOISE_DIFF2_THRESHOLD =
+                _mm_set1_epi8(NOISE_DIFF2_THRESHOLD);
+        __m128i take_running, p0, p1, p2;
+        const __m128i k_zero = _mm_set1_epi16(0);
+        const __m128i k_128 = _mm_set1_epi32(128);
+
+        // Calculate absolute differences
+        DECLARE_ALIGNED_ARRAY(16,unsigned char,abs_diff,16);
+        DECLARE_ALIGNED_ARRAY(16,uint32_t,filter_coefficient,16);
+        __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0]));
+        __m128i v_mc_running_avg_y = _mm_loadu_si128(
+                                         (__m128i *)(&mc_running_avg_y[0]));
+        __m128i a_minus_b = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
+        __m128i b_minus_a = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
+        __m128i v_abs_diff = _mm_adds_epu8(a_minus_b, b_minus_a);
+        _mm_store_si128((__m128i *)(&abs_diff[0]), v_abs_diff);
+
+        // Use LUT to get filter coefficients (two 16b value; f and 256-f)
+        for (c = 0; c < 16; ++c)
+        {
+            filter_coefficient[c] = LUT[abs_diff[c]].as_int;
+        }
+
+        // Filtering...
+        // load filter coefficients (two 16b value; f and 256-f)
+        filter_coefficient_00 = _mm_load_si128(
+                (__m128i *)(&filter_coefficient[ 0]));
+        filter_coefficient_04 = _mm_load_si128(
+                (__m128i *)(&filter_coefficient[ 4]));
+        filter_coefficient_08 = _mm_load_si128(
+                (__m128i *)(&filter_coefficient[ 8]));
+        filter_coefficient_12 = _mm_load_si128(
+                (__m128i *)(&filter_coefficient[12]));
+
+        // expand sig from 8b to 16b
+        v_sig0 = _mm_unpacklo_epi8(v_sig, k_zero);
+        v_sig1 = _mm_unpackhi_epi8(v_sig, k_zero);
+        // expand mc_running_avg_y from 8b to 16b
+        v_mc_running_avg_y0 = _mm_unpacklo_epi8(v_mc_running_avg_y, k_zero);
+        v_mc_running_avg_y1 = _mm_unpackhi_epi8(v_mc_running_avg_y, k_zero);
+        // interleave sig and mc_running_avg_y for upcoming multiply-add
+        state0 = _mm_unpacklo_epi16(v_mc_running_avg_y0, v_sig0);
+        state1 = _mm_unpackhi_epi16(v_mc_running_avg_y0, v_sig0);
+        state2 = _mm_unpacklo_epi16(v_mc_running_avg_y1, v_sig1);
+        state3 = _mm_unpackhi_epi16(v_mc_running_avg_y1, v_sig1);
+        // blend values
+        res0 = _mm_madd_epi16(filter_coefficient_00, state0);
+        res1 = _mm_madd_epi16(filter_coefficient_04, state1);
+        res2 = _mm_madd_epi16(filter_coefficient_08, state2);
+        res3 = _mm_madd_epi16(filter_coefficient_12, state3);
+        res0 = _mm_add_epi32(res0, k_128);
+        res1 = _mm_add_epi32(res1, k_128);
+        res2 = _mm_add_epi32(res2, k_128);
+        res3 = _mm_add_epi32(res3, k_128);
+        res0 = _mm_srai_epi32(res0, 8);
+        res1 = _mm_srai_epi32(res1, 8);
+        res2 = _mm_srai_epi32(res2, 8);
+        res3 = _mm_srai_epi32(res3, 8);
+        // combine the 32b results into a single 8b vector
+        res0 = _mm_packs_epi32(res0, res1);
+        res2 = _mm_packs_epi32(res2, res3);
+        v_running_avg_y = _mm_packus_epi16(res0, res2);
+
+        // Depending on the magnitude of the difference between the signal and
+        // filtered version, either replace the signal by the filtered one or
+        // update the filter state with the signal when the change in a pixel
+        // isn't classified as noise.
+        diff0 = _mm_sub_epi16(v_sig0, res0);
+        diff1 = _mm_sub_epi16(v_sig1, res2);
+        acc_diff = _mm_add_epi16(acc_diff, _mm_add_epi16(diff0, diff1));
+
+        diff0sq = _mm_mullo_epi16(diff0, diff0);
+        diff1sq = _mm_mullo_epi16(diff1, diff1);
+        diff_sq = _mm_packus_epi16(diff0sq, diff1sq);
+        take_running = _mm_cmplt_epi8(diff_sq, kNOISE_DIFF2_THRESHOLD); // NOTE(review): signed compare on packus-saturated (unsigned 0..255) squares — values >= 128 read as negative and pass; confirm threshold semantics
+        p0 = _mm_and_si128(take_running, v_running_avg_y);
+        p1 = _mm_andnot_si128(take_running, v_sig);
+        p2 = _mm_or_si128(p0, p1);
+        _mm_storeu_si128((__m128i *)(&running_avg_y[0]), p2);
+        _mm_storeu_si128((__m128i *)(&filtered[0]), p2);
+
+        // Update pointers for next iteration.
+        sig += sig_stride;
+        filtered += 16;
+        mc_running_avg_y += mc_avg_y_stride;
+        running_avg_y += avg_y_stride;
+    }
+    {
+        // Compute the sum of all pixel differences of this MB.
+        union sum_union s;
+        int sum_diff;
+        s.v = acc_diff;
+        sum_diff = s.e[0] + s.e[1] + s.e[2] + s.e[3] +
+          s.e[4] + s.e[5] + s.e[6] + s.e[7];
+        if (abs(sum_diff) > SUM_DIFF_THRESHOLD)
+        {
+            return COPY_BLOCK;
+        }
+    }
+    vp8_copy_mem16x16(filtered_buf, 16, signal->thismb, sig_stride);
+    return FILTER_BLOCK;
+}
diff --git a/vp8/encoder/x86/encodeopt.asm b/vp8/encoder/x86/encodeopt.asm
index 7ec7d60..fe26b18 100644
--- a/vp8/encoder/x86/encodeopt.asm
+++ b/vp8/encoder/x86/encodeopt.asm
@@ -12,7 +12,7 @@
 %include "vpx_ports/x86_abi_support.asm"
 
 ;int vp8_block_error_xmm(short *coeff_ptr,  short *dcoef_ptr)
-global sym(vp8_block_error_xmm)
+global sym(vp8_block_error_xmm) PRIVATE
 sym(vp8_block_error_xmm):
     push        rbp
     mov         rbp, rsp
@@ -60,7 +60,7 @@
     ret
 
 ;int vp8_block_error_mmx(short *coeff_ptr,  short *dcoef_ptr)
-global sym(vp8_block_error_mmx)
+global sym(vp8_block_error_mmx) PRIVATE
 sym(vp8_block_error_mmx):
     push        rbp
     mov         rbp, rsp
@@ -126,7 +126,7 @@
 
 
 ;int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
-global sym(vp8_mbblock_error_mmx_impl)
+global sym(vp8_mbblock_error_mmx_impl) PRIVATE
 sym(vp8_mbblock_error_mmx_impl):
     push        rbp
     mov         rbp, rsp
@@ -203,7 +203,7 @@
 
 
 ;int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
-global sym(vp8_mbblock_error_xmm_impl)
+global sym(vp8_mbblock_error_xmm_impl) PRIVATE
 sym(vp8_mbblock_error_xmm_impl):
     push        rbp
     mov         rbp, rsp
@@ -273,7 +273,7 @@
 
 
 ;int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
-global sym(vp8_mbuverror_mmx_impl)
+global sym(vp8_mbuverror_mmx_impl) PRIVATE
 sym(vp8_mbuverror_mmx_impl):
     push        rbp
     mov         rbp, rsp
@@ -330,7 +330,7 @@
 
 
 ;int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
-global sym(vp8_mbuverror_xmm_impl)
+global sym(vp8_mbuverror_xmm_impl) PRIVATE
 sym(vp8_mbuverror_xmm_impl):
     push        rbp
     mov         rbp, rsp
diff --git a/vp8/encoder/x86/fwalsh_sse2.asm b/vp8/encoder/x86/fwalsh_sse2.asm
index 71efd56..f4989279 100644
--- a/vp8/encoder/x86/fwalsh_sse2.asm
+++ b/vp8/encoder/x86/fwalsh_sse2.asm
@@ -12,7 +12,7 @@
 %include "vpx_ports/x86_abi_support.asm"
 
 ;void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch)
-global sym(vp8_short_walsh4x4_sse2)
+global sym(vp8_short_walsh4x4_sse2) PRIVATE
 sym(vp8_short_walsh4x4_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/vp8/encoder/x86/quantize_mmx.asm b/vp8/encoder/x86/quantize_mmx.asm
index f29a54e..2864ce1 100644
--- a/vp8/encoder/x86/quantize_mmx.asm
+++ b/vp8/encoder/x86/quantize_mmx.asm
@@ -15,7 +15,7 @@
 ;                           short *qcoeff_ptr,short *dequant_ptr,
 ;                           short *scan_mask, short *round_ptr,
 ;                           short *quant_ptr, short *dqcoeff_ptr);
-global sym(vp8_fast_quantize_b_impl_mmx)
+global sym(vp8_fast_quantize_b_impl_mmx) PRIVATE
 sym(vp8_fast_quantize_b_impl_mmx):
     push        rbp
     mov         rbp, rsp
diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm
index 7c249ff..724e54c 100644
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ b/vp8/encoder/x86/quantize_sse2.asm
@@ -16,7 +16,7 @@
 ;  (BLOCK  *b,                     |  0
 ;   BLOCKD *d)                     |  1
 
-global sym(vp8_regular_quantize_b_sse2)
+global sym(vp8_regular_quantize_b_sse2) PRIVATE
 sym(vp8_regular_quantize_b_sse2):
     push        rbp
     mov         rbp, rsp
@@ -240,7 +240,7 @@
 ;  (BLOCK  *b,                  |  0
 ;   BLOCKD *d)                  |  1
 
-global sym(vp8_fast_quantize_b_sse2)
+global sym(vp8_fast_quantize_b_sse2) PRIVATE
 sym(vp8_fast_quantize_b_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/vp8/encoder/x86/quantize_sse4.asm b/vp8/encoder/x86/quantize_sse4.asm
index 70eac0c..f0e5d40 100644
--- a/vp8/encoder/x86/quantize_sse4.asm
+++ b/vp8/encoder/x86/quantize_sse4.asm
@@ -16,7 +16,7 @@
 ;  (BLOCK  *b,                     |  0
 ;   BLOCKD *d)                     |  1
 
-global sym(vp8_regular_quantize_b_sse4)
+global sym(vp8_regular_quantize_b_sse4) PRIVATE
 sym(vp8_regular_quantize_b_sse4):
 
 %if ABI_IS_32BIT
diff --git a/vp8/encoder/x86/quantize_ssse3.asm b/vp8/encoder/x86/quantize_ssse3.asm
index e698e90..dd526f4 100644
--- a/vp8/encoder/x86/quantize_ssse3.asm
+++ b/vp8/encoder/x86/quantize_ssse3.asm
@@ -17,7 +17,7 @@
 ;   BLOCKD *d)                   |  1
 ;
 
-global sym(vp8_fast_quantize_b_ssse3)
+global sym(vp8_fast_quantize_b_ssse3) PRIVATE
 sym(vp8_fast_quantize_b_ssse3):
     push        rbp
     mov         rbp, rsp
diff --git a/vp8/encoder/x86/ssim_opt.asm b/vp8/encoder/x86/ssim_opt.asm
index c6db3d1..5964a85 100644
--- a/vp8/encoder/x86/ssim_opt.asm
+++ b/vp8/encoder/x86/ssim_opt.asm
@@ -61,7 +61,7 @@
 ; or pavgb At this point this is just meant to be first pass for calculating
 ; all the parms needed for 16x16 ssim so we can play with dssim as distortion
 ; in mode selection code.
-global sym(vp8_ssim_parms_16x16_sse2)
+global sym(vp8_ssim_parms_16x16_sse2) PRIVATE
 sym(vp8_ssim_parms_16x16_sse2):
     push        rbp
     mov         rbp, rsp
@@ -151,7 +151,7 @@
 ; or pavgb At this point this is just meant to be first pass for calculating
 ; all the parms needed for 16x16 ssim so we can play with dssim as distortion
 ; in mode selection code.
-global sym(vp8_ssim_parms_8x8_sse2)
+global sym(vp8_ssim_parms_8x8_sse2) PRIVATE
 sym(vp8_ssim_parms_8x8_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/vp8/encoder/x86/subtract_mmx.asm b/vp8/encoder/x86/subtract_mmx.asm
index 75e8aa3..794dd22 100644
--- a/vp8/encoder/x86/subtract_mmx.asm
+++ b/vp8/encoder/x86/subtract_mmx.asm
@@ -14,7 +14,7 @@
 ;void vp8_subtract_b_mmx_impl(unsigned char *z,  int src_stride,
 ;                            short *diff, unsigned char *Predictor,
 ;                            int pitch);
-global sym(vp8_subtract_b_mmx_impl)
+global sym(vp8_subtract_b_mmx_impl) PRIVATE
 sym(vp8_subtract_b_mmx_impl):
     push        rbp
     mov         rbp, rsp
@@ -75,7 +75,7 @@
 
 ;void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride,
 ;unsigned char *pred, int pred_stride)
-global sym(vp8_subtract_mby_mmx)
+global sym(vp8_subtract_mby_mmx) PRIVATE
 sym(vp8_subtract_mby_mmx):
     push        rbp
     mov         rbp, rsp
@@ -150,7 +150,7 @@
 ;                         int src_stride, unsigned char *upred,
 ;                         unsigned char *vpred, int pred_stride)
 
-global sym(vp8_subtract_mbuv_mmx)
+global sym(vp8_subtract_mbuv_mmx) PRIVATE
 sym(vp8_subtract_mbuv_mmx):
     push        rbp
     mov         rbp, rsp
diff --git a/vp8/encoder/x86/subtract_sse2.asm b/vp8/encoder/x86/subtract_sse2.asm
index 008e9c7..a5d17f5 100644
--- a/vp8/encoder/x86/subtract_sse2.asm
+++ b/vp8/encoder/x86/subtract_sse2.asm
@@ -14,7 +14,7 @@
 ;void vp8_subtract_b_sse2_impl(unsigned char *z,  int src_stride,
 ;                            short *diff, unsigned char *Predictor,
 ;                            int pitch);
-global sym(vp8_subtract_b_sse2_impl)
+global sym(vp8_subtract_b_sse2_impl) PRIVATE
 sym(vp8_subtract_b_sse2_impl):
     push        rbp
     mov         rbp, rsp
@@ -73,7 +73,7 @@
 
 ;void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride,
 ;unsigned char *pred, int pred_stride)
-global sym(vp8_subtract_mby_sse2)
+global sym(vp8_subtract_mby_sse2) PRIVATE
 sym(vp8_subtract_mby_sse2):
     push        rbp
     mov         rbp, rsp
@@ -146,7 +146,7 @@
 ;vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc,
 ;                         int src_stride, unsigned char *upred,
 ;                         unsigned char *vpred, int pred_stride)
-global sym(vp8_subtract_mbuv_sse2)
+global sym(vp8_subtract_mbuv_sse2) PRIVATE
 sym(vp8_subtract_mbuv_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/vp8/encoder/x86/temporal_filter_apply_sse2.asm b/vp8/encoder/x86/temporal_filter_apply_sse2.asm
index b97c694..ce9d983 100644
--- a/vp8/encoder/x86/temporal_filter_apply_sse2.asm
+++ b/vp8/encoder/x86/temporal_filter_apply_sse2.asm
@@ -20,7 +20,7 @@
 ;   int             filter_weight,    |  5
 ;   unsigned int   *accumulator,      |  6
 ;   unsigned short *count)            |  7
-global sym(vp8_temporal_filter_apply_sse2)
+global sym(vp8_temporal_filter_apply_sse2) PRIVATE
 sym(vp8_temporal_filter_apply_sse2):
 
     push        rbp
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index 019edbd..5976297 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -99,6 +99,14 @@
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm
+
+ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoising_sse2.c
+ifeq ($(HAVE_SSE2),yes)
+vp8/encoder/x86/denoising_sse2.c.o: CFLAGS += -msse2
+endif
+endif
+
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_enc_stubs_sse2.c
diff --git a/vp8_scalable_patterns.c b/vp8_scalable_patterns.c
index 4311b1a..9351874 100644
--- a/vp8_scalable_patterns.c
+++ b/vp8_scalable_patterns.c
@@ -493,7 +493,7 @@
     // Cap CPU & first I-frame size
     vpx_codec_control (&codec, VP8E_SET_CPUUSED,                -6);
     vpx_codec_control (&codec, VP8E_SET_STATIC_THRESHOLD,      800);
-    vpx_codec_control (&codec, VP8E_SET_NOISE_SENSITIVITY,       2);
+    vpx_codec_control (&codec, VP8E_SET_NOISE_SENSITIVITY,       1);
 
     max_intra_size_pct = (int) (((double)cfg.rc_buf_optimal_sz * 0.5)
                          * ((double) cfg.g_timebase.den / cfg.g_timebase.num)
diff --git a/vpx_ports/emms.asm b/vpx_ports/emms.asm
index 306e235..efad1a5 100644
--- a/vpx_ports/emms.asm
+++ b/vpx_ports/emms.asm
@@ -12,14 +12,14 @@
 %include "vpx_ports/x86_abi_support.asm"
 
 section .text
-    global sym(vpx_reset_mmx_state)
+global sym(vpx_reset_mmx_state) PRIVATE
 sym(vpx_reset_mmx_state):
     emms
     ret
 
 
 %ifidn __OUTPUT_FORMAT__,x64
-global sym(vpx_winx64_fldcw)
+global sym(vpx_winx64_fldcw) PRIVATE
 sym(vpx_winx64_fldcw):
     sub   rsp, 8
     mov   [rsp], rcx ; win x64 specific
@@ -28,7 +28,7 @@
     ret
 
 
-global sym(vpx_winx64_fstcw)
+global sym(vpx_winx64_fstcw) PRIVATE
 sym(vpx_winx64_fstcw):
     sub   rsp, 8
     fstcw [rsp]
diff --git a/vpx_ports/x86_abi_support.asm b/vpx_ports/x86_abi_support.asm
index cef6a0b..e1a540c 100644
--- a/vpx_ports/x86_abi_support.asm
+++ b/vpx_ports/x86_abi_support.asm
@@ -94,6 +94,31 @@
 %define sym(x) _ %+ x
 %endif
 
+;  PRIVATE
+;  Macro for the attribute to hide a global symbol for the target ABI.
+;  This is only active if CHROMIUM is defined.
+;
+;  Chromium doesn't like exported global symbols due to symbol clashing with
+;  plugins among other things.
+;
+;  Requires Chromium's patched copy of yasm:
+;    http://src.chromium.org/viewvc/chrome?view=rev&revision=73761
+;    http://www.tortall.net/projects/yasm/ticket/236
+;
+%ifdef CHROMIUM
+  %ifidn   __OUTPUT_FORMAT__,elf32
+    %define PRIVATE :hidden
+  %elifidn __OUTPUT_FORMAT__,elf64
+    %define PRIVATE :hidden
+  %elifidn __OUTPUT_FORMAT__,x64
+    %define PRIVATE
+  %else
+    %define PRIVATE :private_extern
+  %endif
+%else
+  %define PRIVATE
+%endif
+
 ; arg()
 ; Return the address specification of the given argument
 ;
@@ -181,7 +206,16 @@
     %endmacro
   %endif
   %endif
-  %define HIDDEN_DATA(x) x
+
+  %ifdef CHROMIUM
+    %ifidn __OUTPUT_FORMAT__,macho32
+      %define HIDDEN_DATA(x) x:private_extern
+    %else
+      %define HIDDEN_DATA(x) x
+    %endif
+  %else
+    %define HIDDEN_DATA(x) x
+  %endif
 %else
   %macro GET_GOT 1
   %endmacro