vpx_scale/dm642/bicubic_scaler_c64.c - webm/libvpx - Git at Google

 /*
  *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license and patent
  *  grant that can be found in the LICENSE file in the root of the source
  *  tree. All contributing project authors may be found in the AUTHORS
  *  file in the root of the source tree.
  */


 #include <float.h>
 #include <math.h>
 #include <stdio.h>
 #include "vpx_mem/vpx_mem.h"
 #include "vpxscale_arbitrary.h"

 extern BICUBIC_SCALER_STRUCT g_b_scaler;

 int bicubic_scale_c64(int in_width, int in_height, int in_stride,
                       int out_width, int out_height, int out_stride,
                       unsigned char *input_image, unsigned char *output_image)
 {
     short *restrict l_w, * restrict l_h;
     short *restrict c_w, * restrict c_h;
     unsigned char *restrict ip, * restrict op, *restrict op_w;
     unsigned char *restrict hbuf;
     int h, w, lw, lh;
     int phase_offset_w, phase_offset_h;
     double coeff;
     int max_phase;

     c_w = g_b_scaler.c_w;
     c_h = g_b_scaler.c_h;

     op = output_image;

     l_w = g_b_scaler.l_w;
     l_h = g_b_scaler.l_h;

     phase_offset_h = 0;

     for (h = 0; h < out_height; h++)
     {
         // select the row to work on
         lh = l_h[h];
         ip = input_image + (in_stride * lh);

         coeff = _memd8_const(&c_h[phase_offset_h*4]);

         // vp8_filter the row vertically into an temporary buffer.
         //  If the phase offset == 0 then all the multiplication
         //  is going to result in the output equalling the input.
         //  So instead point the temporary buffer to the input.
         //  Also handle the boundry condition of not being able to
         //  filter that last lines.
         if (phase_offset_h && (lh < in_height - 2))
         {
             hbuf = g_b_scaler.hbuf;

             for (w = 0; w < in_width; w += 4)
             {
                 int ip1, ip2, ip3, ip4;
                 int y13_12, y11_10, y23_22, y21_20, y33_32, y31_30, y43_42, y41_40;
                 int y10_20, y11_21, y12_22, y13_23, y30_40, y31_41, y32_42, y33_43;
                 int s1, s2, s3, s4;

                 ip1 = _mem4_const(&ip[w - in_stride]);
                 ip2 = _mem4_const(&ip[w]);
                 ip3 = _mem4_const(&ip[w + in_stride]);
                 ip4 = _mem4_const(&ip[w + 2*in_stride]);

                 // realignment of data.  Unpack the data so that it is in short
                 //  format instead of bytes.
                 y13_12 = _unpkhu4(ip1);
                 y11_10 = _unpklu4(ip1);
                 y23_22 = _unpkhu4(ip2);
                 y21_20 = _unpklu4(ip2);
                 y33_32 = _unpkhu4(ip3);
                 y31_30 = _unpklu4(ip3);
                 y43_42 = _unpkhu4(ip4);
                 y41_40 = _unpklu4(ip4);

                 // repack the data so that elements 1 and 2 are together.  this
                 //  lines up so that a dot product with the coefficients can be
                 //  done.
                 y10_20 = _pack2(y11_10, y21_20);
                 y11_21 = _packh2(y11_10, y21_20);
                 y12_22 = _pack2(y13_12, y23_22);
                 y13_23 = _packh2(y13_12, y23_22);

                 s1 = _dotp2(_hi(coeff), y10_20);
                 s2 = _dotp2(_hi(coeff), y11_21);
                 s3 = _dotp2(_hi(coeff), y12_22);
                 s4 = _dotp2(_hi(coeff), y13_23);

                 y30_40 = _pack2(y31_30, y41_40);
                 y31_41 = _packh2(y31_30, y41_40);
                 y32_42 = _pack2(y33_32, y43_42);
                 y33_43 = _packh2(y33_32, y43_42);

                 // now repack elements 3 and 4 together.
                 s1 += _dotp2(_lo(coeff), y30_40);
                 s2 += _dotp2(_lo(coeff), y31_41);
                 s3 += _dotp2(_lo(coeff), y32_42);
                 s4 += _dotp2(_lo(coeff), y33_43);

                 s1 = s1 >> 12;
                 s2 = s2 >> 12;
                 s3 = s3 >> 12;
                 s4 = s4 >> 12;

                 s1 = _pack2(s2, s1);
                 s2 = _pack2(s4, s3);

                 _amem4(&hbuf[w])  = _spacku4(s2, s1);
             }
         }
         else
             hbuf = ip;

         // increase the phase offset for the next time around.
         if (++phase_offset_h >= g_b_scaler.nh)
             phase_offset_h = 0;

         op_w = op;

         // will never be able to interpolate first pixel, so just copy it
         // over here.
         phase_offset_w = 1;
         *op_w++ = hbuf[0];

         if (1 >= g_b_scaler.nw) phase_offset_w = 0;

         max_phase = g_b_scaler.nw;

         for (w = 1; w < out_width; w++)
         {
             double coefficients;
             int hbuf_high, hbuf_low, hbuf_both;
             int sum_high, sum_low, sum;

             // get the index to use to expand the image
             lw = l_w[w];
             coefficients = _amemd8_const(&c_w[phase_offset_w*4]);
             hbuf_both = _mem4_const(&hbuf[lw-1]);

             hbuf_high = _unpkhu4(hbuf_both);
             hbuf_low  = _unpklu4(hbuf_both);

             sum_high = _dotp2(_hi(coefficients), hbuf_high);
             sum_low  = _dotp2(_lo(coefficients), hbuf_low);

             sum = (sum_high + sum_low) >> 12;

             if (++phase_offset_w >= max_phase)
                 phase_offset_w = 0;

             if ((lw + 2) >= in_width)
                 sum = hbuf[lw];

             *op_w++ = sum;
         }

         op += out_stride;
     }

     return 0;
 }

 void bicubic_scale_frame_c64(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
                              int new_width, int new_height)
 {

     dst->y_width = new_width;
     dst->y_height = new_height;
     dst->uv_width = new_width / 2;
     dst->uv_height = new_height / 2;

     dst->y_stride = dst->y_width;
     dst->uv_stride = dst->uv_width;

     bicubic_scale_c64(src->y_width, src->y_height, src->y_stride,
                       new_width, new_height, dst->y_stride,
                       src->y_buffer, dst->y_buffer);

     bicubic_scale_c64(src->uv_width, src->uv_height, src->uv_stride,
                       new_width / 2, new_height / 2, dst->uv_stride,
                       src->u_buffer, dst->u_buffer);

     bicubic_scale_c64(src->uv_width, src->uv_height, src->uv_stride,
                       new_width / 2, new_height / 2, dst->uv_stride,
                       src->v_buffer, dst->v_buffer);
 }
	/*
	* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
	*
	* Use of this source code is governed by a BSD-style license and patent
	* grant that can be found in the LICENSE file in the root of the source
	* tree. All contributing project authors may be found in the AUTHORS
	* file in the root of the source tree.
	*/


	#include <float.h>
	#include <math.h>
	#include <stdio.h>
	#include "vpx_mem/vpx_mem.h"
	#include "vpxscale_arbitrary.h"

	extern BICUBIC_SCALER_STRUCT g_b_scaler;

	int bicubic_scale_c64(int in_width, int in_height, int in_stride,
	int out_width, int out_height, int out_stride,
	unsigned char input_image, unsigned char output_image)
	{
	short restrict l_w, restrict l_h;
	short restrict c_w, restrict c_h;
	unsigned char restrict ip, restrict op, *restrict op_w;
	unsigned char *restrict hbuf;
	int h, w, lw, lh;
	int phase_offset_w, phase_offset_h;
	double coeff;
	int max_phase;

	c_w = g_b_scaler.c_w;
	c_h = g_b_scaler.c_h;

	op = output_image;

	l_w = g_b_scaler.l_w;
	l_h = g_b_scaler.l_h;

	phase_offset_h = 0;

	for (h = 0; h < out_height; h++)
	{
	// select the row to work on
	lh = l_h[h];
	ip = input_image + (in_stride * lh);

	coeff = _memd8_const(&c_h[phase_offset_h*4]);

	// vp8_filter the row vertically into an temporary buffer.
	// If the phase offset == 0 then all the multiplication
	// is going to result in the output equalling the input.
	// So instead point the temporary buffer to the input.
	// Also handle the boundry condition of not being able to
	// filter that last lines.
	if (phase_offset_h && (lh < in_height - 2))
	{
	hbuf = g_b_scaler.hbuf;

	for (w = 0; w < in_width; w += 4)
	{
	int ip1, ip2, ip3, ip4;
	int y13_12, y11_10, y23_22, y21_20, y33_32, y31_30, y43_42, y41_40;
	int y10_20, y11_21, y12_22, y13_23, y30_40, y31_41, y32_42, y33_43;
	int s1, s2, s3, s4;

	ip1 = _mem4_const(&ip[w - in_stride]);
	ip2 = _mem4_const(&ip[w]);
	ip3 = _mem4_const(&ip[w + in_stride]);
	ip4 = _mem4_const(&ip[w + 2*in_stride]);

	// realignment of data. Unpack the data so that it is in short
	// format instead of bytes.
	y13_12 = _unpkhu4(ip1);
	y11_10 = _unpklu4(ip1);
	y23_22 = _unpkhu4(ip2);
	y21_20 = _unpklu4(ip2);
	y33_32 = _unpkhu4(ip3);
	y31_30 = _unpklu4(ip3);
	y43_42 = _unpkhu4(ip4);
	y41_40 = _unpklu4(ip4);

	// repack the data so that elements 1 and 2 are together. this
	// lines up so that a dot product with the coefficients can be
	// done.
	y10_20 = _pack2(y11_10, y21_20);
	y11_21 = _packh2(y11_10, y21_20);
	y12_22 = _pack2(y13_12, y23_22);
	y13_23 = _packh2(y13_12, y23_22);

	s1 = _dotp2(_hi(coeff), y10_20);
	s2 = _dotp2(_hi(coeff), y11_21);
	s3 = _dotp2(_hi(coeff), y12_22);
	s4 = _dotp2(_hi(coeff), y13_23);

	y30_40 = _pack2(y31_30, y41_40);
	y31_41 = _packh2(y31_30, y41_40);
	y32_42 = _pack2(y33_32, y43_42);
	y33_43 = _packh2(y33_32, y43_42);

	// now repack elements 3 and 4 together.
	s1 += _dotp2(_lo(coeff), y30_40);
	s2 += _dotp2(_lo(coeff), y31_41);
	s3 += _dotp2(_lo(coeff), y32_42);
	s4 += _dotp2(_lo(coeff), y33_43);

	s1 = s1 >> 12;
	s2 = s2 >> 12;
	s3 = s3 >> 12;
	s4 = s4 >> 12;

	s1 = _pack2(s2, s1);
	s2 = _pack2(s4, s3);

	_amem4(&hbuf[w]) = _spacku4(s2, s1);
	}
	}
	else
	hbuf = ip;

	// increase the phase offset for the next time around.
	if (++phase_offset_h >= g_b_scaler.nh)
	phase_offset_h = 0;

	op_w = op;

	// will never be able to interpolate first pixel, so just copy it
	// over here.
	phase_offset_w = 1;
	*op_w++ = hbuf[0];

	if (1 >= g_b_scaler.nw) phase_offset_w = 0;

	max_phase = g_b_scaler.nw;

	for (w = 1; w < out_width; w++)
	{
	double coefficients;
	int hbuf_high, hbuf_low, hbuf_both;
	int sum_high, sum_low, sum;

	// get the index to use to expand the image
	lw = l_w[w];
	coefficients = _amemd8_const(&c_w[phase_offset_w*4]);
	hbuf_both = _mem4_const(&hbuf[lw-1]);

	hbuf_high = _unpkhu4(hbuf_both);
	hbuf_low = _unpklu4(hbuf_both);

	sum_high = _dotp2(_hi(coefficients), hbuf_high);
	sum_low = _dotp2(_lo(coefficients), hbuf_low);

	sum = (sum_high + sum_low) >> 12;

	if (++phase_offset_w >= max_phase)
	phase_offset_w = 0;

	if ((lw + 2) >= in_width)
	sum = hbuf[lw];

	*op_w++ = sum;
	}

	op += out_stride;
	}

	return 0;
	}

	void bicubic_scale_frame_c64(YV12_BUFFER_CONFIG src, YV12_BUFFER_CONFIG dst,
	int new_width, int new_height)
	{

	dst->y_width = new_width;
	dst->y_height = new_height;
	dst->uv_width = new_width / 2;
	dst->uv_height = new_height / 2;

	dst->y_stride = dst->y_width;
	dst->uv_stride = dst->uv_width;

	bicubic_scale_c64(src->y_width, src->y_height, src->y_stride,
	new_width, new_height, dst->y_stride,
	src->y_buffer, dst->y_buffer);

	bicubic_scale_c64(src->uv_width, src->uv_height, src->uv_stride,
	new_width / 2, new_height / 2, dst->uv_stride,
	src->u_buffer, dst->u_buffer);

	bicubic_scale_c64(src->uv_width, src->uv_height, src->uv_stride,
	new_width / 2, new_height / 2, dst->uv_stride,
	src->v_buffer, dst->v_buffer);
	}