/* src/audio/fir_hifi3.h - chromiumos/third_party/sound-open-firmware - Git at Google */

 /*
  * Copyright (c) 2017, Intel Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *   * Redistributions of source code must retain the above copyright
  *     notice, this list of conditions and the following disclaimer.
  *   * Redistributions in binary form must reproduce the above copyright
  *     notice, this list of conditions and the following disclaimer in the
  *     documentation and/or other materials provided with the distribution.
  *   * Neither the name of the Intel Corporation nor the
  *     names of its contributors may be used to endorse or promote products
  *     derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
  * Author: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
  */

 #ifndef FIR_HIFI3_H
 #define FIR_HIFI3_H

 #include "fir_config.h"

 #if FIR_HIFI3

 #include <xtensa/config/defs.h>
 #include <xtensa/tie/xt_hifi2.h>
 #include <sof/audio/format.h>

 /*
  * State of one FIR filter instance with 32 bit data and 16 bit coefficients.
  * The inline kernels in this header read and update it once per call.
  */
 struct fir_state_32x16 {
 	/* Circular read and write pointer. The kernels store each new input
 	 * sample here with a step of -sizeof(int32_t).
 	 */
 	ae_int32 *rwp; /* Circular read and write pointer */
 	/* Passed to AE_SETCBEGIN0() by fir_hifi3_setup_circular() */
 	ae_int32 *delay; /* Pointer to FIR delay line */
 	/* Passed to AE_SETCEND0() by fir_hifi3_setup_circular() */
 	ae_int32 *delay_end; /* Pointer to FIR delay line end */
 	/* Read four coefficients at a time by the kernels */
 	ae_f16x4 *coef; /* Pointer to FIR coefficients */
 	/* Written by fir_mute() / fir_unmute(); the kernels in this header do
 	 * not read it.
 	 */
 	int mute; /* Set to 1 to mute EQ output, 0 otherwise */
 	/* The kernels iterate taps >> 2 times, four taps per iteration.
 	 * NOTE(review): presumably expected to be a multiple of 4 - confirm.
 	 */
 	int taps; /* Number of FIR taps */
 	int length; /* Number of FIR taps plus input length (even) */
 	int in_shift; /* Amount of right shifts at input */
 	int out_shift; /* Amount of right shifts at output */
 };

 /* The three functions below are defined outside this header. The notes are
  * inferred from names and signatures only - confirm against the definitions.
  */

 /* Presumably returns the filter state to its initial condition. */
 void fir_reset(struct fir_state_32x16 *fir);

 /* Presumably sets up the coefficients of fir from the 16 bit config data.
  * NOTE(review): the meaning of the int return value is not visible here.
  */
 int fir_init_coef(struct fir_state_32x16 *fir, int16_t config[]);

 /* Presumably assigns delay line memory to fir from *data; the double pointer
  * suggests *data is updated for the next filter - TODO confirm.
  */
 void fir_init_delay(struct fir_state_32x16 *fir, int32_t **data);

 /* Forward declaration: the prototypes below only need pointers to the buffer
  * type. Declaring the tag at file scope keeps it from being scoped to a single
  * parameter list when no earlier include has introduced it, so the header
  * does not depend on include order.
  */
 struct comp_buffer;

 /* Both functions are defined outside this header. Judging by the names and
  * parameters they filter frames of nch channels from source to sink using one
  * fir[] entry per channel, the 2x variant presumably via the two-sample
  * kernel - confirm against the definitions.
  */
 void eq_fir_2x_s32_hifi3(struct fir_state_32x16 fir[],
 			 struct comp_buffer *source, struct comp_buffer *sink,
 			 int frames, int nch);

 void eq_fir_s32_hifi3(struct fir_state_32x16 fir[], struct comp_buffer *source,
 		      struct comp_buffer *sink, int frames, int nch);

 /* The next trivial functions are inlined */

 /* Mark the filter as muted by writing 1 to the mute field of its state. */
 static inline void fir_mute(struct fir_state_32x16 *fir)
 {
 	const int muted = 1;

 	fir->mute = muted;
 }

 /* Clear the muted mark by writing 0 to the mute field of the filter state. */
 static inline void fir_unmute(struct fir_state_32x16 *fir)
 {
 	const int unmuted = 0;

 	fir->mute = unmuted;
 }

 /* Setup circular buffer for FIR input data delay */
 static inline void fir_hifi3_setup_circular(struct fir_state_32x16 *fir)
 {
 	AE_SETCBEGIN0(fir->delay);
 	AE_SETCEND0(fir->delay_end);
 }

 /* Defined outside this header. Presumably reports the left and right shift
  * amounts to use with fir through *lshift and *rshift - confirm against the
  * definition.
  */
 void fir_get_lrshifts(struct fir_state_32x16 *fir, int *lshift,
 		      int *rshift);

 /* The next functions are inlined to optimize execution speed */

 /* HiFi EP has the following number of registers that should not be exceeded
  * 4x 56 bit registers in register file Q
  * 8x 48 bit registers in register file P
  */

 /* Run one input sample through the FIR filter and produce one output sample.
  *
  * *x is stored to the circular delay line at fir->rwp with a step of
  * -sizeof(int32_t). The loop then runs fir->taps >> 2 times, loading four
  * coefficients and four delay line samples per iteration and accumulating
  * the products to a. The sum gets the scaling shift via AE_SLAA64S(), is
  * rounded with AE_ROUND32F48SSYM() and stored to *y.
  *
  * fir:   filter state; rwp is updated, coef and taps are read
  * x:     pointer to the input sample
  * y:     pointer to the output sample
  * shift: shift count passed to AE_SLAA64S()
  *
  * Taps beyond the last full group of four are not processed since the loop
  * count is taps >> 2. The mute field is not checked here.
  * NOTE(review): relies on the circular buffer registers describing this
  * filter's delay line, see fir_hifi3_setup_circular() - confirm in callers.
  */
 static inline void fir_32x16_hifi3(struct fir_state_32x16 *fir, int32_t *x,
 				   int32_t *y, int shift)
 {
 	/* This function uses
 	 * 1x 56 bit registers Q,
 	 * 4x 48 bit registers P
 	 * 3x integers
 	 * 2x address pointers,
 	 */
 	ae_f64 a;
 	ae_valign u;
 	ae_f32x2 data2;
 	ae_f16x4 coefs;
 	ae_f32x2 d0;
 	ae_f32x2 d1;
 	int i;
 	/* Taken before the store below, so reading starts from the slot that
 	 * receives the new sample.
 	 */
 	ae_int32 *dp = fir->rwp;
 	ae_int16x4 *coefp = (ae_int16x4 *)fir->coef;
 	/* Four taps are processed per loop iteration */
 	const int taps_div_4 = fir->taps >> 2;
 	const int inc = sizeof(int32_t);

 	/* Write sample to delay.
 	 * NOTE(review): -sizeof() is an unsigned expression; this relies on it
 	 * converting to a negative step of one sample - confirm against the
 	 * intrinsic's parameter type.
 	 */
 	AE_S32_L_XC((ae_int32)*x, fir->rwp, -sizeof(int32_t));

 	/* Prime the coefficients stream */
 	u = AE_LA64_PP(coefp);

 	/* Note: If the next function is converted to handle two samples
 	 * per call the data load can be done with single instruction
 	 * AE_LP24X2F_C(data2, dp, sizeof(ae_p24x2f));
 	 */
 	a = AE_ZEROQ56();
 	for (i = 0; i < taps_div_4; i++) {
 		/* Load four coefficients. Coef_3 contains tap h[n],
 		 * coef_2 contains h[n+1], coef_1 contains h[n+2], and
 		 * coef_0 contains h[n+3];
 		 */
 		AE_LA16X4_IP(coefs, u, coefp);

 		/* Load two data samples and pack to d0 to data2_h and
 		 * d1 to data2_l.
 		 */
 		AE_L32_XC(d0, dp, inc);
 		AE_L32_XC(d1, dp, inc);
 		data2 = AE_SEL32_LL(d0, d1);

 		/* Accumulate
 		 * a += data2_h * coefs_3 + data2_l * coefs_2. The Q1.31
 		 * data and Q1.15 coefficients are used as 24 bits as
 		 * Q1.23 values.
 		 */
 		AE_MULAAFD32X16_H3_L2(a, data2, coefs);

 		/* Repeat the same for next two taps.
 		 * a += data2_h * coefs_1 + data2_l * coefs_0.
 		 */
 		AE_L32_XC(d0, dp, inc);
 		AE_L32_XC(d1, dp, inc);
 		data2 = AE_SEL32_LL(d0, d1);
 		AE_MULAAFD32X16_H1_L0(a, data2, coefs);
 		/* No explicit coefp increment is needed here: the
 		 * AE_LA16X4_IP() load at the top of the loop advances coefp.
 		 */
 	}

 	/* Do scaling shifts and store sample. */
 	a = AE_SLAA64S(a, shift);
 	AE_S32_L_I(AE_ROUND32F48SSYM(a), (ae_int32 *)y, 0);
 }

 /* HiFi EP has the following number of registers that should not be exceeded
  * 4x 56 bit registers in register file Q
  * 8x 48 bit registers in register file P
  */

 /* Run two input samples through the FIR filter and produce two output
  * samples in one pass over the coefficients.
  *
  * *x0 and then *x1 are stored to the circular delay line at fir->rwp, each
  * with a step of -sizeof(int32_t). The loop runs fir->taps >> 2 times,
  * loading four coefficients and two sample pairs per iteration and updating
  * accumulators a and b with the FIR_HH / FIR_HL multiply-accumulate
  * intrinsics. Both sums get the scaling shift via AE_SLAA64S() and are
  * rounded with AE_ROUND32F48SSYM(); a is stored to *y0 and b to *y1.
  *
  * fir:   filter state; rwp is updated, coef and taps are read
  * x0:    pointer to the input sample that is stored first
  * x1:    pointer to the input sample that is stored second
  * y0:    receives the result from accumulator a
  * y1:    receives the result from accumulator b
  * shift: shift count passed to AE_SLAA64S()
  *
  * Taps beyond the last full group of four are not processed since the loop
  * count is taps >> 2. The mute field is not checked here.
  * NOTE(review): the delay line is read as ae_f32x2 pairs, so it presumably
  * needs pair alignment and an even length - confirm against the init code.
  */
 static inline void fir_32x16_2x_hifi3(struct fir_state_32x16 *fir, int32_t *x0,
 				      int32_t *x1, int32_t *y0, int32_t *y1,
 				      int shift)
 {
 	/* This function uses
 	 * 2x 56 bit registers Q,
 	 * 4x 48 bit registers P
 	 * 3x integers
 	 * 2x address pointers,
 	 */
 	ae_f64 a;
 	ae_f64 b;
 	ae_valign u;
 	ae_f32x2 d0;
 	ae_f32x2 d1;
 	ae_f16x4 coefs;
 	int i;
 	ae_f32x2 *dp;
 	ae_f16x4 *coefp = fir->coef;
 	/* Four taps are processed per loop iteration */
 	const int taps_div_4 = fir->taps >> 2;
 	/* Step of one sample pair */
 	const int inc = 2 * sizeof(int32_t);

 	/* Write samples to delay. dp is taken between the two stores, so
 	 * pair reads start from the slot that receives x1.
 	 */
 	AE_S32_L_XC((ae_int32)*x0, fir->rwp, -sizeof(int32_t));
 	dp = (ae_f32x2 *)fir->rwp;
 	AE_S32_L_XC((ae_int32)*x1, fir->rwp, -sizeof(int32_t));

 	/* Clear both accumulators. The result in a is stored to y0 and the
 	 * result in b to y1 at the end of the function.
 	 */
 	a = AE_ZERO64();
 	b = AE_ZERO64();

 	/* Prime the coefficients stream */
 	u = AE_LA64_PP(coefp);

 	/* Preload the first pair of data samples to d0. Inside the loop
 	 * every newly loaded pair d1 becomes d0 of the following step.
 	 */
 	AE_L32X2_XC(d0, dp, inc);
 	for (i = 0; i < taps_div_4; i++) {
 		/* Load four coefficients. Coef_3 contains tap h[n],
 		 * coef_2 contains h[n+1], coef_1 contains h[n+2], and
 		 * coef_0 contains h[n+3];
 		 */
 		AE_LA16X4_IP(coefs, u, coefp);

 		/* Load two data samples. Upper part d1_h is x[n+1] and
 		 * lower part d1_l is x[n].
 		 */
 		AE_L32X2_XC(d1, dp, inc);

 		/* Quad MAC (HH)
 		 * b += d0_h * coefs_3 + d0_l * coefs_2
 		 * a += d0_l * coefs_3 + d1_h * coefs_2
 		 */
 		AE_MULAFD32X16X2_FIR_HH(b, a, d0, d1, coefs);
 		d0 = d1;

 		/* Repeat the same for the next two taps with the next
 		 * pair of data samples.
 		 */
 		AE_L32X2_XC(d1, dp, inc);

 		/* Quad MAC (HL)
 		 * b += d0_h * coefs_1 + d0_l * coefs_0
 		 * a += d0_l * coefs_1 + d1_h * coefs_0
 		 */
 		AE_MULAFD32X16X2_FIR_HL(b, a, d0, d1, coefs);
 		d0 = d1;
 	}

 	/* Do scaling shifts and store sample. */
 	b = AE_SLAA64S(b, shift);
 	a = AE_SLAA64S(a, shift);
 	AE_S32_L_I(AE_ROUND32F48SSYM(b), (ae_int32 *)y1, 0);
 	AE_S32_L_I(AE_ROUND32F48SSYM(a), (ae_int32 *)y0, 0);
 }

 #endif
 #endif
	/*
	* Copyright (c) 2017, Intel Corporation
	* All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are met:
	* * Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	* * Neither the name of the Intel Corporation nor the
	* names of its contributors may be used to endorse or promote products
	* derived from this software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
	* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
	* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	* POSSIBILITY OF SUCH DAMAGE.
	*
	* Author: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
	*/

	#ifndef FIR_HIFI3_H
	#define FIR_HIFI3_H

	#include "fir_config.h"

	#if FIR_HIFI3

	#include <xtensa/config/defs.h>
	#include <xtensa/tie/xt_hifi2.h>
	#include <sof/audio/format.h>

	struct fir_state_32x16 {
	ae_int32 rwp; / Circular read and write pointer */
	ae_int32 delay; / Pointer to FIR delay line */
	ae_int32 delay_end; / Pointer to FIR delay line end */
	ae_f16x4 coef; / Pointer to FIR coefficients */
	int mute; /* Set to 1 to mute EQ output, 0 otherwise */
	int taps; /* Number of FIR taps */
	int length; /* Number of FIR taps plus input length (even) */
	int in_shift; /* Amount of right shifts at input */
	int out_shift; /* Amount of right shifts at output */
	};

	/* The three functions below are defined outside this header. The notes
	 * are inferred from names and signatures only - confirm against the
	 * definitions.
	 */

	/* Presumably returns the filter state to its initial condition. */
	void fir_reset(struct fir_state_32x16 *fir);

	/* Presumably sets up the coefficients of fir from the 16 bit config
	 * data. NOTE(review): the meaning of the return value is not visible here.
	 */
	int fir_init_coef(struct fir_state_32x16 *fir, int16_t config[]);

	/* Takes the filter state by pointer and the delay memory by pointer to
	 * pointer, matching the intact declaration of the same function earlier in
	 * this file. Presumably assigns delay line memory to fir from *data and
	 * updates *data for the next filter - TODO confirm.
	 */
	void fir_init_delay(struct fir_state_32x16 *fir, int32_t **data);

	/* Forward declaration: the prototypes below only need pointers to the
	 * buffer type. Declaring the tag at file scope keeps it from being scoped
	 * to a single parameter list when no earlier include has introduced it.
	 */
	struct comp_buffer;

	/* Both functions are defined outside this header and take the source and
	 * sink buffers by pointer. Judging by the names and parameters they filter
	 * frames of nch channels from source to sink using one fir[] entry per
	 * channel - confirm against the definitions.
	 */
	void eq_fir_2x_s32_hifi3(struct fir_state_32x16 fir[],
				 struct comp_buffer *source,
				 struct comp_buffer *sink,
				 int frames, int nch);

	void eq_fir_s32_hifi3(struct fir_state_32x16 fir[],
			      struct comp_buffer *source,
			      struct comp_buffer *sink, int frames, int nch);

	/* The next trivial functions are inlined */

	/* Mark the filter as muted by writing 1 to the mute field of its state. */
	static inline void fir_mute(struct fir_state_32x16 *fir)
	{
		const int muted = 1;

		fir->mute = muted;
	}

	/* Clear the muted mark by writing 0 to the mute field of the filter state. */
	static inline void fir_unmute(struct fir_state_32x16 *fir)
	{
		const int unmuted = 0;

		fir->mute = unmuted;
	}

	/* Set up the circular buffer used for the FIR input data delay: the delay
	 * line start goes to AE_SETCBEGIN0() and the delay line end to
	 * AE_SETCEND0().
	 */
	static inline void fir_hifi3_setup_circular(struct fir_state_32x16 *fir)
	{
		AE_SETCBEGIN0(
			fir->delay);
		AE_SETCEND0(
			fir->delay_end);
	}

	/* Defined outside this header. All three parameters are pointers, matching
	 * the intact declaration of the same function earlier in this file.
	 * Presumably reports the left and right shift amounts to use with fir
	 * through *lshift and *rshift - confirm against the definition.
	 */
	void fir_get_lrshifts(struct fir_state_32x16 *fir, int *lshift,
			      int *rshift);

	/* The next functions are inlined to optimize execution speed */

	/* HiFi EP has the following number of registers that should not be exceeded
	* 4x 56 bit registers in register file Q
	* 8x 48 bit registers in register file P
	*/

	static inline void fir_32x16_hifi3(struct fir_state_32x16 fir, int32_t x,
	int32_t *y, int shift)
	{
	/* This function uses
	* 1x 56 bit registers Q,
	* 4x 48 bit registers P
	* 3x integers
	* 2x address pointers,
	*/
	ae_f64 a;
	ae_valign u;
	ae_f32x2 data2;
	ae_f16x4 coefs;
	ae_f32x2 d0;
	ae_f32x2 d1;
	int i;
	ae_int32 *dp = fir->rwp;
	ae_int16x4 coefp = (ae_int16x4 )fir->coef;
	const int taps_div_4 = fir->taps >> 2;
	const int inc = sizeof(int32_t);

	/* Write sample to delay */
	AE_S32_L_XC((ae_int32)*x, fir->rwp, -sizeof(int32_t));

	/* Prime the coefficients stream */
	u = AE_LA64_PP(coefp);

	/* Note: If the next function is converted to handle two samples
	* per call the data load can be done with single instruction
	* AE_LP24X2F_C(data2, dp, sizeof(ae_p24x2f));
	*/
	a = AE_ZEROQ56();
	for (i = 0; i < taps_div_4; i++) {
	/* Load four coefficients. Coef_3 contains tap h[n],
	* coef_2 contains h[n+1], coef_1 contains h[n+2], and
	* coef_0 contains h[n+3];
	*/
	AE_LA16X4_IP(coefs, u, coefp);

	/* Load two data samples and pack to d0 to data2_h and
	* d1 to data2_l.
	*/
	AE_L32_XC(d0, dp, inc);
	AE_L32_XC(d1, dp, inc);
	data2 = AE_SEL32_LL(d0, d1);

	/* Accumulate
	* a += data2_h * coefs_3 + data2_l * coefs_2. The Q1.31
	* data and Q1.15 coefficients are used as 24 bits as
	* Q1.23 values.
	*/
	AE_MULAAFD32X16_H3_L2(a, data2, coefs);

	/* Repeat the same for next two taps and increase coefp.
	* a += data2_h * coefs_1 + data2_l * coefs_0.
	*/
	AE_L32_XC(d0, dp, inc);
	AE_L32_XC(d1, dp, inc);
	data2 = AE_SEL32_LL(d0, d1);
	AE_MULAAFD32X16_H1_L0(a, data2, coefs);
	//coefp += 4;
	}

	/* Do scaling shifts and store sample. */
	a = AE_SLAA64S(a, shift);
	AE_S32_L_I(AE_ROUND32F48SSYM(a), (ae_int32 *)y, 0);
	}

	/* HiFi EP has the following number of registers that should not be exceeded
	* 4x 56 bit registers in register file Q
	* 8x 48 bit registers in register file P
	*/

	static inline void fir_32x16_2x_hifi3(struct fir_state_32x16 fir, int32_t x0,
	int32_t x1, int32_t y0, int32_t *y1,
	int shift)
	{
	/* This function uses
	* 2x 56 bit registers Q,
	* 4x 48 bit registers P
	* 3x integers
	* 2x address pointers,
	*/
	ae_f64 a;
	ae_f64 b;
	ae_valign u;
	ae_f32x2 d0;
	ae_f32x2 d1;
	ae_f16x4 coefs;
	int i;
	ae_f32x2 *dp;
	ae_f16x4 *coefp = fir->coef;
	const int taps_div_4 = fir->taps >> 2;
	const int inc = 2 * sizeof(int32_t);

	/* Write samples to delay */
	AE_S32_L_XC((ae_int32)*x0, fir->rwp, -sizeof(int32_t));
	dp = (ae_f32x2 *)fir->rwp;
	AE_S32_L_XC((ae_int32)*x1, fir->rwp, -sizeof(int32_t));

	/* Note: If the next function is converted to handle two samples
	* per call the data load can be done with single instruction
	* AE_LP24X2F_C(data2, dp, sizeof(ae_p24x2f));
	*/
	a = AE_ZERO64();
	b = AE_ZERO64();

	/* Prime the coefficients stream */
	u = AE_LA64_PP(coefp);

	/* Load two data samples and pack to d0 to data2_h and
	* d1 to data2_l.
	*/
	AE_L32X2_XC(d0, dp, inc);
	for (i = 0; i < taps_div_4; i++) {
	/* Load four coefficients. Coef_3 contains tap h[n],
	* coef_2 contains h[n+1], coef_1 contains h[n+2], and
	* coef_0 contains h[n+3];
	*/
	AE_LA16X4_IP(coefs, u, coefp);

	/* Load two data samples. Upper part d1_h is x[n+1] and
	* lower part d1_l is x[n].
	*/
	AE_L32X2_XC(d1, dp, inc);

	/* Quad MAC (HH)
	* b += d0_h * coefs_3 + d0_l * coefs_2
	* a += d0_l * coefs_3 + d1_h * coefs_2
	*/
	AE_MULAFD32X16X2_FIR_HH(b, a, d0, d1, coefs);
	d0 = d1;

	/* Repeat the same for next two taps and increase coefp. */
	AE_L32X2_XC(d1, dp, inc);

	/* Quad MAC (HL)
	* b += d0_h * coefs_1 + d0_l * coefs_0
	* a += d0_l * coefs_1 + d1_h * coefs_0
	*/
	AE_MULAFD32X16X2_FIR_HL(b, a, d0, d1, coefs);
	d0 = d1;
	}

	/* Do scaling shifts and store sample. */
	b = AE_SLAA64S(b, shift);
	a = AE_SLAA64S(a, shift);
	AE_S32_L_I(AE_ROUND32F48SSYM(b), (ae_int32 *)y1, 0);
	AE_S32_L_I(AE_ROUND32F48SSYM(a), (ae_int32 *)y0, 0);
	}

	#endif
	#endif