vpx_dsp/x86/post_proc_sse2.c - webm/libvpx - Git at Google

 /*
  *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */

 #include <assert.h>
 #include <emmintrin.h>

 #include <stdio.h>

 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/x86/mem_sse2.h"

 extern const int16_t vpx_rv[];

 void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows,
                                int cols, int flimit) {
   int col;
   const __m128i zero = _mm_setzero_si128();
   const __m128i f = _mm_set1_epi32(flimit);
   DECLARE_ALIGNED(16, int16_t, above_context[8 * 8]);

   // 8 columns are processed at a time.
   // If rows is less than 8 the bottom border extension fails.
   assert(cols % 8 == 0);
   assert(rows >= 8);

   for (col = 0; col < cols; col += 8) {
     int row, i;
     __m128i s = _mm_loadl_epi64((__m128i *)dst);
     __m128i sum, sumsq_0, sumsq_1;
     __m128i tmp_0, tmp_1;
     __m128i below_context;

     s = _mm_unpacklo_epi8(s, zero);

     for (i = 0; i < 8; ++i) {
       _mm_store_si128((__m128i *)above_context + i, s);
     }

     // sum *= 9
     sum = _mm_slli_epi16(s, 3);
     sum = _mm_add_epi16(s, sum);

     // sum^2 * 9 == (sum * 9) * sum
     tmp_0 = _mm_mullo_epi16(sum, s);
     tmp_1 = _mm_mulhi_epi16(sum, s);

     sumsq_0 = _mm_unpacklo_epi16(tmp_0, tmp_1);
     sumsq_1 = _mm_unpackhi_epi16(tmp_0, tmp_1);

     // Prime sum/sumsq
     for (i = 1; i <= 6; ++i) {
       __m128i a = _mm_loadl_epi64((__m128i *)(dst + i * pitch));
       a = _mm_unpacklo_epi8(a, zero);
       sum = _mm_add_epi16(sum, a);
       a = _mm_mullo_epi16(a, a);
       sumsq_0 = _mm_add_epi32(sumsq_0, _mm_unpacklo_epi16(a, zero));
       sumsq_1 = _mm_add_epi32(sumsq_1, _mm_unpackhi_epi16(a, zero));
     }

     for (row = 0; row < rows + 8; row++) {
       const __m128i above =
           _mm_load_si128((__m128i *)above_context + (row & 7));
       __m128i this_row = _mm_loadl_epi64((__m128i *)(dst + row * pitch));
       __m128i above_sq, below_sq;
       __m128i mask_0, mask_1;
       __m128i multmp_0, multmp_1;
       __m128i rv;
       __m128i out;

       this_row = _mm_unpacklo_epi8(this_row, zero);

       if (row + 7 < rows) {
         // Instead of copying the end context we just stop loading when we get
         // to the last one.
         below_context = _mm_loadl_epi64((__m128i *)(dst + (row + 7) * pitch));
         below_context = _mm_unpacklo_epi8(below_context, zero);
       }

       sum = _mm_sub_epi16(sum, above);
       sum = _mm_add_epi16(sum, below_context);

       // context^2 fits in 16 bits. Don't need to mulhi and combine. Just zero
       // extend. Unfortunately we can't do below_sq - above_sq in 16 bits
       // because x86 does not have unpack with sign extension.
       above_sq = _mm_mullo_epi16(above, above);
       sumsq_0 = _mm_sub_epi32(sumsq_0, _mm_unpacklo_epi16(above_sq, zero));
       sumsq_1 = _mm_sub_epi32(sumsq_1, _mm_unpackhi_epi16(above_sq, zero));

       below_sq = _mm_mullo_epi16(below_context, below_context);
       sumsq_0 = _mm_add_epi32(sumsq_0, _mm_unpacklo_epi16(below_sq, zero));
       sumsq_1 = _mm_add_epi32(sumsq_1, _mm_unpackhi_epi16(below_sq, zero));

       // sumsq * 16 - sumsq == sumsq * 15
       mask_0 = _mm_slli_epi32(sumsq_0, 4);
       mask_0 = _mm_sub_epi32(mask_0, sumsq_0);
       mask_1 = _mm_slli_epi32(sumsq_1, 4);
       mask_1 = _mm_sub_epi32(mask_1, sumsq_1);

       multmp_0 = _mm_mullo_epi16(sum, sum);
       multmp_1 = _mm_mulhi_epi16(sum, sum);

       mask_0 = _mm_sub_epi32(mask_0, _mm_unpacklo_epi16(multmp_0, multmp_1));
       mask_1 = _mm_sub_epi32(mask_1, _mm_unpackhi_epi16(multmp_0, multmp_1));

       // mask - f gives a negative value when mask < f
       mask_0 = _mm_sub_epi32(mask_0, f);
       mask_1 = _mm_sub_epi32(mask_1, f);

       // Shift the sign bit down to create a mask
       mask_0 = _mm_srai_epi32(mask_0, 31);
       mask_1 = _mm_srai_epi32(mask_1, 31);

       mask_0 = _mm_packs_epi32(mask_0, mask_1);

       rv = _mm_loadu_si128((__m128i const *)(vpx_rv + (row & 127)));

       mask_1 = _mm_add_epi16(rv, sum);
       mask_1 = _mm_add_epi16(mask_1, this_row);
       mask_1 = _mm_srai_epi16(mask_1, 4);

       mask_1 = _mm_and_si128(mask_0, mask_1);
       mask_0 = _mm_andnot_si128(mask_0, this_row);
       out = _mm_or_si128(mask_1, mask_0);

       _mm_storel_epi64((__m128i *)(dst + row * pitch),
                        _mm_packus_epi16(out, zero));

       _mm_store_si128((__m128i *)above_context + ((row + 8) & 7), this_row);
     }

     dst += 8;
   }
 }
	/*
	* Copyright (c) 2018 The WebM project authors. All Rights Reserved.
	*
	* Use of this source code is governed by a BSD-style license
	* that can be found in the LICENSE file in the root of the source
	* tree. An additional intellectual property rights grant can be found
	* in the file PATENTS. All contributing project authors may
	* be found in the AUTHORS file in the root of the source tree.
	*/

	#include <assert.h>
	#include <emmintrin.h>

	#include <stdio.h>

	#include "./vpx_dsp_rtcd.h"
	#include "vpx/vpx_integer.h"
	#include "vpx_dsp/x86/mem_sse2.h"

	extern const int16_t vpx_rv[];

	void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows,
	int cols, int flimit) {
	int col;
	const __m128i zero = _mm_setzero_si128();
	const __m128i f = _mm_set1_epi32(flimit);
	DECLARE_ALIGNED(16, int16_t, above_context[8 * 8]);

	// 8 columns are processed at a time.
	// If rows is less than 8 the bottom border extension fails.
	assert(cols % 8 == 0);
	assert(rows >= 8);

	for (col = 0; col < cols; col += 8) {
	int row, i;
	__m128i s = _mm_loadl_epi64((__m128i *)dst);
	__m128i sum, sumsq_0, sumsq_1;
	__m128i tmp_0, tmp_1;
	__m128i below_context;

	s = _mm_unpacklo_epi8(s, zero);

	for (i = 0; i < 8; ++i) {
	_mm_store_si128((__m128i *)above_context + i, s);
	}

	// sum *= 9
	sum = _mm_slli_epi16(s, 3);
	sum = _mm_add_epi16(s, sum);

	// sum^2 * 9 == (sum * 9) * sum
	tmp_0 = _mm_mullo_epi16(sum, s);
	tmp_1 = _mm_mulhi_epi16(sum, s);

	sumsq_0 = _mm_unpacklo_epi16(tmp_0, tmp_1);
	sumsq_1 = _mm_unpackhi_epi16(tmp_0, tmp_1);

	// Prime sum/sumsq
	for (i = 1; i <= 6; ++i) {
	__m128i a = _mm_loadl_epi64((__m128i )(dst + i pitch));
	a = _mm_unpacklo_epi8(a, zero);
	sum = _mm_add_epi16(sum, a);
	a = _mm_mullo_epi16(a, a);
	sumsq_0 = _mm_add_epi32(sumsq_0, _mm_unpacklo_epi16(a, zero));
	sumsq_1 = _mm_add_epi32(sumsq_1, _mm_unpackhi_epi16(a, zero));
	}

	for (row = 0; row < rows + 8; row++) {
	const __m128i above =
	_mm_load_si128((__m128i *)above_context + (row & 7));
	__m128i this_row = _mm_loadl_epi64((__m128i )(dst + row pitch));
	__m128i above_sq, below_sq;
	__m128i mask_0, mask_1;
	__m128i multmp_0, multmp_1;
	__m128i rv;
	__m128i out;

	this_row = _mm_unpacklo_epi8(this_row, zero);

	if (row + 7 < rows) {
	// Instead of copying the end context we just stop loading when we get
	// to the last one.
	below_context = _mm_loadl_epi64((__m128i )(dst + (row + 7) pitch));
	below_context = _mm_unpacklo_epi8(below_context, zero);
	}

	sum = _mm_sub_epi16(sum, above);
	sum = _mm_add_epi16(sum, below_context);

	// context^2 fits in 16 bits. Don't need to mulhi and combine. Just zero
	// extend. Unfortunately we can't do below_sq - above_sq in 16 bits
	// because x86 does not have unpack with sign extension.
	above_sq = _mm_mullo_epi16(above, above);
	sumsq_0 = _mm_sub_epi32(sumsq_0, _mm_unpacklo_epi16(above_sq, zero));
	sumsq_1 = _mm_sub_epi32(sumsq_1, _mm_unpackhi_epi16(above_sq, zero));

	below_sq = _mm_mullo_epi16(below_context, below_context);
	sumsq_0 = _mm_add_epi32(sumsq_0, _mm_unpacklo_epi16(below_sq, zero));
	sumsq_1 = _mm_add_epi32(sumsq_1, _mm_unpackhi_epi16(below_sq, zero));

	// sumsq * 16 - sumsq == sumsq * 15
	mask_0 = _mm_slli_epi32(sumsq_0, 4);
	mask_0 = _mm_sub_epi32(mask_0, sumsq_0);
	mask_1 = _mm_slli_epi32(sumsq_1, 4);
	mask_1 = _mm_sub_epi32(mask_1, sumsq_1);

	multmp_0 = _mm_mullo_epi16(sum, sum);
	multmp_1 = _mm_mulhi_epi16(sum, sum);

	mask_0 = _mm_sub_epi32(mask_0, _mm_unpacklo_epi16(multmp_0, multmp_1));
	mask_1 = _mm_sub_epi32(mask_1, _mm_unpackhi_epi16(multmp_0, multmp_1));

	// mask - f gives a negative value when mask < f
	mask_0 = _mm_sub_epi32(mask_0, f);
	mask_1 = _mm_sub_epi32(mask_1, f);

	// Shift the sign bit down to create a mask
	mask_0 = _mm_srai_epi32(mask_0, 31);
	mask_1 = _mm_srai_epi32(mask_1, 31);

	mask_0 = _mm_packs_epi32(mask_0, mask_1);

	rv = _mm_loadu_si128((__m128i const *)(vpx_rv + (row & 127)));

	mask_1 = _mm_add_epi16(rv, sum);
	mask_1 = _mm_add_epi16(mask_1, this_row);
	mask_1 = _mm_srai_epi16(mask_1, 4);

	mask_1 = _mm_and_si128(mask_0, mask_1);
	mask_0 = _mm_andnot_si128(mask_0, this_row);
	out = _mm_or_si128(mask_1, mask_0);

	_mm_storel_epi64((__m128i )(dst + row pitch),
	_mm_packus_epi16(out, zero));

	_mm_store_si128((__m128i *)above_context + ((row + 8) & 7), this_row);
	}

	dst += 8;
	}
	}