src/audio/src_hifi2ep.c - chromiumos/third_party/sound-open-firmware - Git at Google

 /*
  * Copyright (c) 2017, Intel Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *   * Redistributions of source code must retain the above copyright
  *     notice, this list of conditions and the following disclaimer.
  *   * Redistributions in binary form must reproduce the above copyright
  *     notice, this list of conditions and the following disclaimer in the
  *     documentation and/or other materials provided with the distribution.
  *   * Neither the name of the Intel Corporation nor the
  *     names of its contributors may be used to endorse or promote products
  *     derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
  * Author: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
  *
  */

 /* HiFi EP optimized code parts for SRC */

 #include <stdint.h>
 #include <sof/alloc.h>
 #include <sof/audio/format.h>
 #include <sof/math/numbers.h>

 #include "src_config.h"
 #include "src.h"

 #if SRC_HIFIEP

 #include <xtensa/config/defs.h>
 #include <xtensa/tie/xt_hifi2.h>

 /* HiFi EP has
  * 4x 56 bit registers in register file Q
  * 8x 48 bit registers in register file P
  */

 #if SRC_SHORT /* 16 bit coefficients version */

 /* Compute one output sample per channel for a single polyphase sub-filter
  * using 16 bit coefficients.
  *
  * rp:         read position in the interleaved FIR input delay line. All
  *             data loads are circular (_C variants), so the caller must
  *             have programmed the circular buffer begin and end.
  * cp:         coefficients of this sub-filter, loaded two at a time as
  *             16 bit pairs.
  * wp0:        where the nch consecutive 32 bit results are stored.
  * taps_div_4: inner loop count, each iteration consumes four taps.
  * shift:      shift count passed to AE_SRAAQ56() before rounding.
  * nch:        number of interleaved channels.
  */
 static inline void fir_filter(ae_q32s *rp, const void *cp, ae_q32s *wp0,
 	const int taps_div_4, const int shift, const int nch)
 {
 	/* This function uses
 	 * 2x 56 bit registers Q,
 	 * 4x 48 bit registers P
 	 * 4x integers
 	 * 4x address pointers,
 	 */
 	ae_q56s a0;
 	ae_q56s a1;
 	ae_p24x2f data2;
 	ae_p24x2f coef2;
 	ae_p24x2f p0;
 	ae_p24x2f p1;
 	ae_p16x2s *coefp;
 	ae_p24x2f *dp = (ae_p24x2f *)rp;
 	ae_p24x2f *dp0;
 	ae_q32s *wp = wp0;
 	int i;
 	int j;
 	/* Stride of the paired sample loads in the stereo path */
 	const int inc = sizeof(ae_p24x2f);
 	/* Byte distance between successive samples of one channel in the
 	 * interleaved delay line, used by the generic path.
 	 */
 	const int inc_nch = nch * (int)sizeof(int32_t);

 	/* 2ch FIR case */
 	if (nch == 2) {
 		/* Move data pointer back by one sample to start from right
 		 * channel sample. Discard read value p0.
 		 */
 		AE_LP24F_C(p0, dp, -sizeof(ae_p24f));

 		/* Reset coefficient pointer and clear accumulators */
 		coefp = (ae_p16x2s *)cp;
 		a0 = AE_ZEROQ56();
 		a1 = AE_ZEROQ56();

 		/* Compute FIR filter for both channels with four
 		 * taps per every loop iteration.  Two coefficients
 		 * are loaded simultaneously. Each data load fetches
 		 * one sample of both channels.
 		 */
 		for (i = 0; i < taps_div_4; i++) {
 			/* Load two coefficients. Coef2_h contains tap *coefp
 			 * and coef2_l contains the next tap.
 			 */
 			coef2 = AE_LP16X2F_I(coefp, 0);
 			coefp++;

 			/* Load two data samples from two channels */
 			AE_LP24X2F_C(p0, dp, inc); /* r0, l0 */
 			AE_LP24X2F_C(p1, dp, inc); /* r1, l1 */

 			/* Select to data2 the low halves of p0 and p1
 			 * (successive left channel samples) for a0, then the
 			 * high halves (successive right channel samples) for
 			 * a1. Each multiply accumulates
 			 * data2_h * coef2_h + data2_l * coef2_l. The Q1.31
 			 * data and Q1.15 coefficients are used as 24 bits as
 			 * Q1.23 values.
 			 */
 			data2 = AE_SELP24_LL(p0, p1);
 			AE_MULAAFP24S_HH_LL(a0, data2, coef2);
 			data2 = AE_SELP24_HH(p0, p1);
 			AE_MULAAFP24S_HH_LL(a1, data2, coef2);

 			/* Repeat for next two taps */
 			coef2 = AE_LP16X2F_I(coefp, 0);
 			coefp++;
 			AE_LP24X2F_C(p0, dp, inc); /* r2, l2 */
 			AE_LP24X2F_C(p1, dp, inc); /* r3, l3 */
 			data2 = AE_SELP24_LL(p0, p1);
 			AE_MULAAFP24S_HH_LL(a0, data2, coef2);
 			data2 = AE_SELP24_HH(p0, p1);
 			AE_MULAAFP24S_HH_LL(a1, data2, coef2);
 		}

 		/* Scale FIR output with right shifts, round/saturate
 		 * to Q1.31, and store 32 bit output.
 		 */
 		AE_SQ32F_I(AE_ROUNDSQ32SYM(AE_SRAAQ56(a0, shift)), wp, 0);
 		AE_SQ32F_I(AE_ROUNDSQ32SYM(AE_SRAAQ56(a1, shift)), wp,
 			sizeof(int32_t));
 		return;
 	}

 	for (j = 0; j < nch; j++) {
 		/* Copy pointer and advance to next ch with dummy load */
 		dp0 = dp;
 		AE_LP24F_C(p0, dp, -sizeof(ae_p24f));

 		/* Reset coefficient pointer and clear accumulator */
 		coefp = (ae_p16x2s *)cp;
 		a0 = AE_ZEROQ56();

 		/* Compute FIR filter for current channel with four
 		 * taps per every loop iteration.  Two coefficients
 		 * are loaded simultaneously. Data is read
 		 * from interleaved buffer with stride of channels
 		 * count, a fixed two word stride would be correct for
 		 * stereo only.
 		 */
 		for (i = 0; i < taps_div_4; i++) {
 			/* Load two coefficients */
 			coef2 = *coefp++;

 			/* Load two data samples */
 			AE_LP24F_C(p0, dp0, inc_nch);
 			AE_LP24F_C(p1, dp0, inc_nch);

 			/* Pack p0 and p1 to data2_h and data2_l */
 			data2 = AE_SELP24_LL(p0, p1);

 			/* Accumulate data2_h * coef2_h + data2_l * coef2_l */
 			AE_MULAAFP24S_HH_LL(a0, data2, coef2);

 			/* Repeat for next two filter taps */
 			coef2 = *coefp++;
 			AE_LP24F_C(p0, dp0, inc_nch);
 			AE_LP24F_C(p1, dp0, inc_nch);
 			data2 = AE_SELP24_LL(p0, p1);
 			AE_MULAAFP24S_HH_LL(a0, data2, coef2);
 		}

 		/* Scale FIR output with right shifts, round/saturate
 		 * to Q1.31, and store 32 bit output. Advance write
 		 * pointer to next sample.
 		 */
 		AE_SQ32F_I(AE_ROUNDSQ32SYM(AE_SRAAQ56(a0, shift)), wp, 0);
 		wp++;
 	}
 }

 #else /* 32bit coefficients version */

 /* Compute one output sample per channel for a single polyphase sub-filter
  * using 32 bit coefficients.
  *
  * rp:         read position in the interleaved FIR input delay line. All
  *             data loads are circular (_C variants), so the caller must
  *             have programmed the circular buffer begin and end.
  * cp:         coefficients of this sub-filter, loaded two at a time as
  *             a 2 x 32 bit pair.
  * wp0:        where the nch consecutive 32 bit results are stored.
  * taps_div_4: inner loop count, each iteration consumes four taps.
  * shift:      shift count passed to AE_SRAAQ56() before rounding.
  * nch:        number of interleaved channels.
  */
 static inline void fir_filter(ae_q32s *rp, const void *cp, ae_q32s *wp0,
 	const int taps_div_4, const int shift, const int nch)
 {
 	/* This function uses
 	 * 2x 56 bit registers Q,
 	 * 4x 48 bit registers P
 	 * 4x integers
 	 * 4x address pointers,
 	 */
 	ae_q56s a0;
 	ae_q56s a1;
 	ae_p24x2f p0;
 	ae_p24x2f p1;
 	ae_p24x2f data2;
 	ae_p24x2f coef2;
 	ae_p24x2f *coefp;
 	ae_p24x2f *dp = (ae_p24x2f *)rp;
 	ae_p24x2f *dp0;
 	ae_q32s *wp = wp0;
 	int i;
 	int j;
 	/* Stride of the paired sample loads in the stereo path */
 	const int inc = sizeof(ae_p24x2f);
 	/* Byte distance between successive samples of one channel in the
 	 * interleaved delay line, used by the generic path.
 	 */
 	const int inc_nch = nch * (int)sizeof(int32_t);

 	/* 2ch FIR case */
 	if (nch == 2) {
 		/* Move data pointer back by one sample to start from right
 		 * channel sample. Discard read value p0.
 		 */
 		AE_LP24F_C(p0, dp, -sizeof(ae_p24f));

 		/* Reset coefficient pointer and clear accumulators */
 		coefp = (ae_p24x2f *)cp;
 		a0 = AE_ZEROQ56();
 		a1 = AE_ZEROQ56();

 		/* Compute FIR filter for both channels with four
 		 * taps per every loop iteration.  Two coefficients
 		 * are loaded simultaneously. Each data load fetches
 		 * one sample of both channels.
 		 */
 		for (i = 0; i < taps_div_4; i++) {
 			/* Load two coefficients. Coef2_h contains tap *coefp
 			 * and coef2_l contains the next tap.
 			 */
 			/* TODO: Ensure coefficients are 64 bits aligned */
 			coef2 = AE_LP24X2F_I(coefp, 0);
 			coefp++;

 			/* Load two data samples from two channels */
 			AE_LP24X2F_C(p0, dp, inc); /* r0, l0 */
 			AE_LP24X2F_C(p1, dp, inc); /* r1, l1 */

 			/* Select to data2 the low halves of p0 and p1
 			 * (successive left channel samples) for a0, then the
 			 * high halves (successive right channel samples) for
 			 * a1.
 			 */

 			/* Accumulate to a0 and a1
 			 * data2_h * coef2_h + data2_l * coef2_l. The Q1.31
 			 * data and coefficients are used as the most
 			 * significant 24 bits as Q1.23 values.
 			 */
 			data2 = AE_SELP24_LL(p0, p1);
 			AE_MULAAFP24S_HH_LL(a0, data2, coef2);
 			data2 = AE_SELP24_HH(p0, p1);
 			AE_MULAAFP24S_HH_LL(a1, data2, coef2);

 			/* Repeat for next two taps */
 			coef2 = AE_LP24X2F_I(coefp, 0);
 			coefp++;
 			AE_LP24X2F_C(p0, dp, inc); /* r2, l2 */
 			AE_LP24X2F_C(p1, dp, inc); /* r3, l3 */
 			data2 = AE_SELP24_LL(p0, p1);
 			AE_MULAAFP24S_HH_LL(a0, data2, coef2);
 			data2 = AE_SELP24_HH(p0, p1);
 			AE_MULAAFP24S_HH_LL(a1, data2, coef2);
 		}

 		/* Scale FIR output with right shifts, round/saturate
 		 * to Q1.31, and store 32 bit output.
 		 */
 		AE_SQ32F_I(AE_ROUNDSQ32SYM(AE_SRAAQ56(a0, shift)), wp, 0);
 		AE_SQ32F_I(AE_ROUNDSQ32SYM(AE_SRAAQ56(a1, shift)), wp,
 			sizeof(int32_t));
 		return;
 	}

 	for (j = 0; j < nch; j++) {
 		/* Copy pointer and advance to next ch with dummy load */
 		dp0 = dp;
 		AE_LP24F_C(p0, dp, -sizeof(ae_p24f));

 		/* Reset coefficient pointer and clear accumulator */
 		coefp = (ae_p24x2f *)cp;
 		a0 = AE_ZEROQ56();

 		/* Compute FIR filter for current channel with four
 		 * taps per every loop iteration.  Two coefficients
 		 * are loaded simultaneously. Data is read
 		 * from interleaved buffer with stride of channels
 		 * count, a fixed two word stride would be correct for
 		 * stereo only.
 		 */
 		for (i = 0; i < taps_div_4; i++) {
 			/* Load two coefficients */
 			coef2 = *coefp++;

 			/* Load two data samples and place them to L and H of
 			 * data2.
 			 */
 			AE_LP24F_C(p0, dp0, inc_nch);
 			AE_LP24F_C(p1, dp0, inc_nch);
 			data2 = AE_SELP24_LH(p0, p1);

 			/* Accumulate to a0
 			 * data2_h * coef2_h + data2_l * coef2_l. The Q1.31
 			 * data and coefficients are used as the most
 			 * significant 24 bits as Q1.23 values.
 			 */
 			AE_MULAAFP24S_HH_LL(a0, data2, coef2);

 			/* Repeat for next two filter taps */
 			coef2 = *coefp++;
 			AE_LP24F_C(p0, dp0, inc_nch);
 			AE_LP24F_C(p1, dp0, inc_nch);
 			data2 = AE_SELP24_LH(p0, p1);
 			AE_MULAAFP24S_HH_LL(a0, data2, coef2);
 		}

 		/* Scale FIR output with right shifts, round/saturate
 		 * to Q1.31, and store 32 bit output. Advance write
 		 * pointer to next sample.
 		 */
 		AE_SQ32F_I(AE_ROUNDSQ32SYM(AE_SRAAQ56(a0, shift)), wp, 0);
 		wp++;
 	}
 }
 #endif /* 32bit coefficients version */

 /* Run one polyphase SRC stage s->times times on 32 bit samples.
  *
  * Every pass copies nch * blk_in words from the input buffer at
  * s->x_rptr into the FIR delay line, runs fir_filter() once per
  * sub-filter to fill the output delay line, and then copies
  * nch * num_of_subfilters words from the output delay line to the output
  * buffer at s->y_wptr. The pointers s->x_rptr, s->y_wptr, fir_wp and
  * out_rp of the state are advanced as a side effect.
  */
 void src_polyphase_stage_cir(struct src_stage_prm *s)
 {
 	/* This function uses
 	 *  1x 56 bit registers Q,
 	 *  0x 48 bit registers P,
 	 * 16x integers
 	 *  7x address pointers,
 	 */
 	ae_q56s q;
 	ae_q32s *rp;
 	ae_q32s *wp;
 	int i;
 	int n;
 	int m;
 	int n_wrap_buf;
 	int n_min;
 	struct src_state *fir = s->state;
 	struct src_stage *cfg = s->stage;
 	int32_t *fir_end = &fir->fir_delay[fir->fir_delay_size];
 	int32_t *out_delay_end = &fir->out_delay[fir->out_delay_size];
 	const void *cp; /* Can be int32_t or int16_t */
 	const size_t out_size = fir->out_delay_size * sizeof(int32_t);
 	const int nch = s->nch;
 	const int nch_x_odm = cfg->odm * nch;
 	const int blk_in_words = nch * cfg->blk_in;
 	const int blk_out_words = nch * cfg->num_of_subfilters;
 	const int sz = sizeof(int32_t);
 	const int n_sz = -sizeof(int32_t);
 	const int rewind_sz = sz * (nch * (cfg->blk_in
 		+ (cfg->num_of_subfilters - 1) * cfg->idm) - nch);
 	const int nch_x_idm_sz = -nch * cfg->idm * sizeof(int32_t);
 	const int taps_div_4 = cfg->subfilter_length >> 2;

 #if SRC_SHORT
 	const size_t subfilter_size = cfg->subfilter_length * sizeof(int16_t);
 #else
 	const size_t subfilter_size = cfg->subfilter_length * sizeof(int32_t);
 #endif

 	for (n = 0; n < s->times; n++) {
 		/* Input data to filter */
 		m = blk_in_words;

 		/* Setup circular buffer for FIR input data delay */
 		AE_SETCBEGIN0(fir->fir_delay);
 		AE_SETCEND0(fir_end);

 		while (m > 0) {
 			/* Number of words until circular wrap */
 			n_wrap_buf = s->x_end_addr - s->x_rptr;
 			n_min = (m < n_wrap_buf) ? m : n_wrap_buf;
 			m -= n_min;
 			for (i = 0; i < n_min; i++) {
 				/* Load 32 bits sample to accumulator */
 				q = AE_LQ32F_I((ae_q32s *)s->x_rptr++, 0);

 				/* Store to circular buffer, advance pointer */
 				AE_SQ32F_C(q, (ae_q32s *)fir->fir_wp, n_sz);
 			}

 			/* Check for wrap */
 			src_circ_inc_wrap(&s->x_rptr, s->x_end_addr, s->x_size);
 		}

 		/* Do filter */
 		cp = cfg->coefs; /* Reset to 1st coefficient */
 		rp = (ae_q32s *)fir->fir_wp;

 		/* Do circular modification to pointer rp by amount of
 		 * rewind to data start. Loaded value q is discarded.
 		 */
 		AE_LQ32F_C(q, (ae_q32s *)rp, rewind_sz);

 		/* Reset FIR write pointer and compute all polyphase
 		 * sub-filters.
 		 */
 		wp = (ae_q32s *)fir->out_rp;
 		for (i = 0; i < cfg->num_of_subfilters; i++) {
 			fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch);
 			wp += nch_x_odm;

 			/* Step to the next sub-filter in bytes. The cast
 			 * avoids arithmetic on a void pointer that is not
 			 * defined by ISO C.
 			 */
 			cp = (const char *)cp + subfilter_size;
 			src_circ_inc_wrap((int32_t **)&wp, out_delay_end,
 				out_size);

 			/* Circular advance pointer rp by number of
 			 * channels x input delay multiplier. Loaded value q
 			 * is discarded.
 			 */
 			AE_LQ32F_C(q, rp, nch_x_idm_sz);
 		}

 		/* Output */

 		/* Setup circular buffer for SRC out delay access */
 		AE_SETCBEGIN0(fir->out_delay);
 		AE_SETCEND0(out_delay_end);
 		m = blk_out_words;
 		while (m > 0) {
 			/* Number of words until output buffer wrap */
 			n_wrap_buf = s->y_end_addr - s->y_wptr;
 			n_min = (m < n_wrap_buf) ? m : n_wrap_buf;
 			m -= n_min;
 			for (i = 0; i < n_min; i++) {
 				/* Circular load followed by linear store */
 				AE_LQ32F_C(q, (ae_q32s *)fir->out_rp, sz);
 				AE_SQ32F_I(q, (ae_q32s *)s->y_wptr, 0);
 				s->y_wptr++;
 			}
 			/* Check wrap */
 			src_circ_inc_wrap(&s->y_wptr, s->y_end_addr, s->y_size);
 		}
 	}
 }

 /* Run one polyphase SRC stage s->times times on 24 bit samples held in
  * 32 bit words.
  *
  * Every pass copies nch * blk_in words from the input buffer at
  * s->x_rptr into the FIR delay line with a left shift by 8, runs
  * fir_filter() once per sub-filter to fill the output delay line, and
  * then copies nch * num_of_subfilters words from the output delay line
  * to the output buffer at s->y_wptr with a right shift by 8. The
  * pointers s->x_rptr, s->y_wptr, fir_wp and out_rp of the state are
  * advanced as a side effect.
  */
 void src_polyphase_stage_cir_s24(struct src_stage_prm *s)
 {
 	/* This function uses
 	 *  1x 56 bit registers Q,
 	 *  0x 48 bit registers P,
 	 * 16x integers
 	 *  7x address pointers,
 	 */
 	ae_q56s q;
 	ae_q32s *rp;
 	ae_q32s *wp;
 	int i;
 	int n;
 	int m;
 	int n_wrap_buf;
 	int n_min;
 	struct src_state *fir = s->state;
 	struct src_stage *cfg = s->stage;
 	int32_t *fir_end = &fir->fir_delay[fir->fir_delay_size];
 	int32_t *out_delay_end = &fir->out_delay[fir->out_delay_size];
 	const void *cp; /* Can be int32_t or int16_t */
 	const size_t out_size = fir->out_delay_size * sizeof(int32_t);
 	const int nch = s->nch;
 	const int nch_x_odm = cfg->odm * nch;
 	const int blk_in_words = nch * cfg->blk_in;
 	const int blk_out_words = nch * cfg->num_of_subfilters;
 	const int sz = sizeof(int32_t);
 	const int n_sz = -sizeof(int32_t);
 	const int rewind_sz = sz * (nch * (cfg->blk_in
 		+ (cfg->num_of_subfilters - 1) * cfg->idm) - nch);
 	const int nch_x_idm_sz = -nch * cfg->idm * sizeof(int32_t);
 	const int taps_div_4 = cfg->subfilter_length >> 2;

 #if SRC_SHORT
 	const size_t subfilter_size = cfg->subfilter_length * sizeof(int16_t);
 #else
 	const size_t subfilter_size = cfg->subfilter_length * sizeof(int32_t);
 #endif

 	for (n = 0; n < s->times; n++) {
 		/* Input data to filter */
 		m = blk_in_words;

 		/* Setup circular buffer for FIR input data delay */
 		AE_SETCBEGIN0(fir->fir_delay);
 		AE_SETCEND0(fir_end);

 		while (m > 0) {
 			/* Number of words without circular wrap */
 			n_wrap_buf = s->x_end_addr - s->x_rptr;
 			n_min = (m < n_wrap_buf) ? m : n_wrap_buf;
 			m -= n_min;
 			for (i = 0; i < n_min; i++) {
 				/* Load 32 bits sample to accumulator
 				 * and left shift by 8, advance read
 				 * pointer.
 				 */
 				q = AE_SLLIQ56(AE_LQ32F_I(
 					(ae_q32s *)s->x_rptr++, 0), 8);

 				/* Store to circular buffer, advance
 				 * write pointer.
 				 */
 				AE_SQ32F_C(q, (ae_q32s *)fir->fir_wp, n_sz);
 			}

 			/* Check for wrap */
 			src_circ_inc_wrap(&s->x_rptr, s->x_end_addr, s->x_size);
 		}

 		/* Do filter */
 		cp = cfg->coefs; /* Reset to 1st coefficient */
 		rp = (ae_q32s *)fir->fir_wp;

 		/* Do circular modification to pointer rp by amount of
 		 * rewind to data start. Loaded value q is discarded.
 		 */
 		AE_LQ32F_C(q, (ae_q32s *)rp, rewind_sz);

 		/* Reset FIR output write pointer and compute all polyphase
 		 * sub-filters.
 		 */
 		wp = (ae_q32s *)fir->out_rp;
 		for (i = 0; i < cfg->num_of_subfilters; i++) {
 			fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch);
 			wp += nch_x_odm;

 			/* Step to the next sub-filter in bytes. The cast
 			 * avoids arithmetic on a void pointer that is not
 			 * defined by ISO C.
 			 */
 			cp = (const char *)cp + subfilter_size;
 			src_circ_inc_wrap((int32_t **)&wp, out_delay_end,
 				out_size);

 			/* Circular advance pointer rp by number of
 			 * channels x input delay multiplier. Loaded value q
 			 * is discarded.
 			 */
 			AE_LQ32F_C(q, rp, nch_x_idm_sz);
 		}

 		/* Output */

 		/* Setup circular buffer for SRC out delay access */
 		AE_SETCBEGIN0(fir->out_delay);
 		AE_SETCEND0(out_delay_end);
 		m = blk_out_words;
 		while (m > 0) {
 			/* Number of words until output buffer wrap */
 			n_wrap_buf = s->y_end_addr - s->y_wptr;
 			n_min = (m < n_wrap_buf) ? m : n_wrap_buf;
 			m -= n_min;
 			for (i = 0; i < n_min; i++) {
 				/* Circular load for 32 bit sample,
 				 * advance pointer.
 				 */
 				AE_LQ32F_C(q, (ae_q32s *)fir->out_rp, sz);

 				/* Store value as shifted right by 8 for
 				 * sign extended 24 bit value, advance pointer.
 				 */
 				AE_SQ32F_I(AE_SRAIQ56(q, 8),
 					   (ae_q32s *)s->y_wptr, 0);
 				s->y_wptr++;
 			}
 			/* Check wrap */
 			src_circ_inc_wrap(&s->y_wptr, s->y_end_addr, s->y_size);
 		}
 	}
 }

 #endif
	/*
	* Copyright (c) 2017, Intel Corporation
	* All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are met:
	* * Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	* * Neither the name of the Intel Corporation nor the
	* names of its contributors may be used to endorse or promote products
	* derived from this software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
	* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
	* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	* POSSIBILITY OF SUCH DAMAGE.
	*
	* Author: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
	*
	*/

	/* HiFi EP optimized code parts for SRC */

	#include <stdint.h>
	#include <sof/alloc.h>
	#include <sof/audio/format.h>
	#include <sof/math/numbers.h>

	#include "src_config.h"
	#include "src.h"

	#if SRC_HIFIEP

	#include <xtensa/config/defs.h>
	#include <xtensa/tie/xt_hifi2.h>

	/* HiFi EP has
	* 4x 56 bit registers in register file Q
	* 8x 48 bit registers in register file P
	*/

	#if SRC_SHORT /* 16 bit coefficients version */

	/* Compute one output sample per channel for a single polyphase sub-filter
	 * using 16 bit coefficients.
	 *
	 * rp:         read position in the interleaved FIR input delay line. All
	 *             data loads are circular (_C variants), so the caller must
	 *             have programmed the circular buffer begin and end.
	 * cp:         coefficients of this sub-filter, loaded two at a time as
	 *             16 bit pairs.
	 * wp0:        where the nch consecutive 32 bit results are stored.
	 * taps_div_4: inner loop count, each iteration consumes four taps.
	 * shift:      shift count passed to AE_SRAAQ56() before rounding.
	 * nch:        number of interleaved channels.
	 */
	static inline void fir_filter(ae_q32s *rp, const void *cp, ae_q32s *wp0,
		const int taps_div_4, const int shift, const int nch)
	{
		/* This function uses
		 * 2x 56 bit registers Q,
		 * 4x 48 bit registers P
		 * 4x integers
		 * 4x address pointers,
		 */
		ae_q56s a0;
		ae_q56s a1;
		ae_p24x2f data2;
		ae_p24x2f coef2;
		ae_p24x2f p0;
		ae_p24x2f p1;
		ae_p16x2s *coefp;
		ae_p24x2f *dp = (ae_p24x2f *)rp;
		ae_p24x2f *dp0;
		ae_q32s *wp = wp0;
		int i;
		int j;
		/* Stride of the paired sample loads in the stereo path */
		const int inc = sizeof(ae_p24x2f);
		/* Byte distance between successive samples of one channel in the
		 * interleaved delay line, used by the generic path.
		 */
		const int inc_nch = nch * (int)sizeof(int32_t);

		/* 2ch FIR case */
		if (nch == 2) {
			/* Move data pointer back by one sample to start from right
			 * channel sample. Discard read value p0.
			 */
			AE_LP24F_C(p0, dp, -sizeof(ae_p24f));

			/* Reset coefficient pointer and clear accumulators */
			coefp = (ae_p16x2s *)cp;
			a0 = AE_ZEROQ56();
			a1 = AE_ZEROQ56();

			/* Compute FIR filter for both channels with four
			 * taps per every loop iteration. Two coefficients
			 * are loaded simultaneously. Each data load fetches
			 * one sample of both channels.
			 */
			for (i = 0; i < taps_div_4; i++) {
				/* Load two coefficients. Coef2_h contains tap *coefp
				 * and coef2_l contains the next tap.
				 */
				coef2 = AE_LP16X2F_I(coefp, 0);
				coefp++;

				/* Load two data samples from two channels */
				AE_LP24X2F_C(p0, dp, inc); /* r0, l0 */
				AE_LP24X2F_C(p1, dp, inc); /* r1, l1 */

				/* Select to data2 the low halves of p0 and p1
				 * (successive left channel samples) for a0, then the
				 * high halves (successive right channel samples) for
				 * a1. Each multiply accumulates
				 * data2_h * coef2_h + data2_l * coef2_l. The Q1.31
				 * data and Q1.15 coefficients are used as 24 bits as
				 * Q1.23 values.
				 */
				data2 = AE_SELP24_LL(p0, p1);
				AE_MULAAFP24S_HH_LL(a0, data2, coef2);
				data2 = AE_SELP24_HH(p0, p1);
				AE_MULAAFP24S_HH_LL(a1, data2, coef2);

				/* Repeat for next two taps */
				coef2 = AE_LP16X2F_I(coefp, 0);
				coefp++;
				AE_LP24X2F_C(p0, dp, inc); /* r2, l2 */
				AE_LP24X2F_C(p1, dp, inc); /* r3, l3 */
				data2 = AE_SELP24_LL(p0, p1);
				AE_MULAAFP24S_HH_LL(a0, data2, coef2);
				data2 = AE_SELP24_HH(p0, p1);
				AE_MULAAFP24S_HH_LL(a1, data2, coef2);
			}

			/* Scale FIR output with right shifts, round/saturate
			 * to Q1.31, and store 32 bit output.
			 */
			AE_SQ32F_I(AE_ROUNDSQ32SYM(AE_SRAAQ56(a0, shift)), wp, 0);
			AE_SQ32F_I(AE_ROUNDSQ32SYM(AE_SRAAQ56(a1, shift)), wp,
				sizeof(int32_t));
			return;
		}

		for (j = 0; j < nch; j++) {
			/* Copy pointer and advance to next ch with dummy load */
			dp0 = dp;
			AE_LP24F_C(p0, dp, -sizeof(ae_p24f));

			/* Reset coefficient pointer and clear accumulator */
			coefp = (ae_p16x2s *)cp;
			a0 = AE_ZEROQ56();

			/* Compute FIR filter for current channel with four
			 * taps per every loop iteration. Two coefficients
			 * are loaded simultaneously. Data is read
			 * from interleaved buffer with stride of channels
			 * count, a fixed two word stride would be correct for
			 * stereo only.
			 */
			for (i = 0; i < taps_div_4; i++) {
				/* Load two coefficients */
				coef2 = *coefp++;

				/* Load two data samples */
				AE_LP24F_C(p0, dp0, inc_nch);
				AE_LP24F_C(p1, dp0, inc_nch);

				/* Pack p0 and p1 to data2_h and data2_l */
				data2 = AE_SELP24_LL(p0, p1);

				/* Accumulate data2_h * coef2_h + data2_l * coef2_l */
				AE_MULAAFP24S_HH_LL(a0, data2, coef2);

				/* Repeat for next two filter taps */
				coef2 = *coefp++;
				AE_LP24F_C(p0, dp0, inc_nch);
				AE_LP24F_C(p1, dp0, inc_nch);
				data2 = AE_SELP24_LL(p0, p1);
				AE_MULAAFP24S_HH_LL(a0, data2, coef2);
			}

			/* Scale FIR output with right shifts, round/saturate
			 * to Q1.31, and store 32 bit output. Advance write
			 * pointer to next sample.
			 */
			AE_SQ32F_I(AE_ROUNDSQ32SYM(AE_SRAAQ56(a0, shift)), wp, 0);
			wp++;
		}
	}

	#else /* 32bit coefficients version */

	/* Compute one output sample per channel for a single polyphase sub-filter
	 * using 32 bit coefficients.
	 *
	 * rp:         read position in the interleaved FIR input delay line. All
	 *             data loads are circular (_C variants), so the caller must
	 *             have programmed the circular buffer begin and end.
	 * cp:         coefficients of this sub-filter, loaded two at a time as
	 *             a 2 x 32 bit pair.
	 * wp0:        where the nch consecutive 32 bit results are stored.
	 * taps_div_4: inner loop count, each iteration consumes four taps.
	 * shift:      shift count passed to AE_SRAAQ56() before rounding.
	 * nch:        number of interleaved channels.
	 */
	static inline void fir_filter(ae_q32s *rp, const void *cp, ae_q32s *wp0,
		const int taps_div_4, const int shift, const int nch)
	{
		/* This function uses
		 * 2x 56 bit registers Q,
		 * 4x 48 bit registers P
		 * 4x integers
		 * 4x address pointers,
		 */
		ae_q56s a0;
		ae_q56s a1;
		ae_p24x2f p0;
		ae_p24x2f p1;
		ae_p24x2f data2;
		ae_p24x2f coef2;
		ae_p24x2f *coefp;
		ae_p24x2f *dp = (ae_p24x2f *)rp;
		ae_p24x2f *dp0;
		ae_q32s *wp = wp0;
		int i;
		int j;
		/* Stride of the paired sample loads in the stereo path */
		const int inc = sizeof(ae_p24x2f);
		/* Byte distance between successive samples of one channel in the
		 * interleaved delay line, used by the generic path.
		 */
		const int inc_nch = nch * (int)sizeof(int32_t);

		/* 2ch FIR case */
		if (nch == 2) {
			/* Move data pointer back by one sample to start from right
			 * channel sample. Discard read value p0.
			 */
			AE_LP24F_C(p0, dp, -sizeof(ae_p24f));

			/* Reset coefficient pointer and clear accumulators */
			coefp = (ae_p24x2f *)cp;
			a0 = AE_ZEROQ56();
			a1 = AE_ZEROQ56();

			/* Compute FIR filter for both channels with four
			 * taps per every loop iteration. Two coefficients
			 * are loaded simultaneously. Each data load fetches
			 * one sample of both channels.
			 */
			for (i = 0; i < taps_div_4; i++) {
				/* Load two coefficients. Coef2_h contains tap *coefp
				 * and coef2_l contains the next tap.
				 */
				/* TODO: Ensure coefficients are 64 bits aligned */
				coef2 = AE_LP24X2F_I(coefp, 0);
				coefp++;

				/* Load two data samples from two channels */
				AE_LP24X2F_C(p0, dp, inc); /* r0, l0 */
				AE_LP24X2F_C(p1, dp, inc); /* r1, l1 */

				/* Select to data2 the low halves of p0 and p1
				 * (successive left channel samples) for a0, then the
				 * high halves (successive right channel samples) for
				 * a1.
				 */

				/* Accumulate to a0 and a1
				 * data2_h * coef2_h + data2_l * coef2_l. The Q1.31
				 * data and coefficients are used as the most
				 * significant 24 bits as Q1.23 values.
				 */
				data2 = AE_SELP24_LL(p0, p1);
				AE_MULAAFP24S_HH_LL(a0, data2, coef2);
				data2 = AE_SELP24_HH(p0, p1);
				AE_MULAAFP24S_HH_LL(a1, data2, coef2);

				/* Repeat for next two taps */
				coef2 = AE_LP24X2F_I(coefp, 0);
				coefp++;
				AE_LP24X2F_C(p0, dp, inc); /* r2, l2 */
				AE_LP24X2F_C(p1, dp, inc); /* r3, l3 */
				data2 = AE_SELP24_LL(p0, p1);
				AE_MULAAFP24S_HH_LL(a0, data2, coef2);
				data2 = AE_SELP24_HH(p0, p1);
				AE_MULAAFP24S_HH_LL(a1, data2, coef2);
			}

			/* Scale FIR output with right shifts, round/saturate
			 * to Q1.31, and store 32 bit output.
			 */
			AE_SQ32F_I(AE_ROUNDSQ32SYM(AE_SRAAQ56(a0, shift)), wp, 0);
			AE_SQ32F_I(AE_ROUNDSQ32SYM(AE_SRAAQ56(a1, shift)), wp,
				sizeof(int32_t));
			return;
		}

		for (j = 0; j < nch; j++) {
			/* Copy pointer and advance to next ch with dummy load */
			dp0 = dp;
			AE_LP24F_C(p0, dp, -sizeof(ae_p24f));

			/* Reset coefficient pointer and clear accumulator */
			coefp = (ae_p24x2f *)cp;
			a0 = AE_ZEROQ56();

			/* Compute FIR filter for current channel with four
			 * taps per every loop iteration. Two coefficients
			 * are loaded simultaneously. Data is read
			 * from interleaved buffer with stride of channels
			 * count, a fixed two word stride would be correct for
			 * stereo only.
			 */
			for (i = 0; i < taps_div_4; i++) {
				/* Load two coefficients */
				coef2 = *coefp++;

				/* Load two data samples and place them to L and H of
				 * data2.
				 */
				AE_LP24F_C(p0, dp0, inc_nch);
				AE_LP24F_C(p1, dp0, inc_nch);
				data2 = AE_SELP24_LH(p0, p1);

				/* Accumulate to a0
				 * data2_h * coef2_h + data2_l * coef2_l. The Q1.31
				 * data and coefficients are used as the most
				 * significant 24 bits as Q1.23 values.
				 */
				AE_MULAAFP24S_HH_LL(a0, data2, coef2);

				/* Repeat for next two filter taps */
				coef2 = *coefp++;
				AE_LP24F_C(p0, dp0, inc_nch);
				AE_LP24F_C(p1, dp0, inc_nch);
				data2 = AE_SELP24_LH(p0, p1);
				AE_MULAAFP24S_HH_LL(a0, data2, coef2);
			}

			/* Scale FIR output with right shifts, round/saturate
			 * to Q1.31, and store 32 bit output. Advance write
			 * pointer to next sample.
			 */
			AE_SQ32F_I(AE_ROUNDSQ32SYM(AE_SRAAQ56(a0, shift)), wp, 0);
			wp++;
		}
	}
	#endif /* 32bit coefficients version */

	/* Run one polyphase SRC stage s->times times on 32 bit samples.
	 *
	 * Every pass copies nch * blk_in words from the input buffer at
	 * s->x_rptr into the FIR delay line, runs fir_filter() once per
	 * sub-filter to fill the output delay line, and then copies
	 * nch * num_of_subfilters words from the output delay line to the output
	 * buffer at s->y_wptr. The pointers s->x_rptr, s->y_wptr, fir_wp and
	 * out_rp of the state are advanced as a side effect.
	 */
	void src_polyphase_stage_cir(struct src_stage_prm *s)
	{
		/* This function uses
		 *  1x 56 bit registers Q,
		 *  0x 48 bit registers P,
		 * 16x integers
		 *  7x address pointers,
		 */
		ae_q56s q;
		ae_q32s *rp;
		ae_q32s *wp;
		int i;
		int n;
		int m;
		int n_wrap_buf;
		int n_min;
		struct src_state *fir = s->state;
		struct src_stage *cfg = s->stage;
		int32_t *fir_end = &fir->fir_delay[fir->fir_delay_size];
		int32_t *out_delay_end = &fir->out_delay[fir->out_delay_size];
		const void *cp; /* Can be int32_t or int16_t */
		const size_t out_size = fir->out_delay_size * sizeof(int32_t);
		const int nch = s->nch;
		const int nch_x_odm = cfg->odm * nch;
		const int blk_in_words = nch * cfg->blk_in;
		const int blk_out_words = nch * cfg->num_of_subfilters;
		const int sz = sizeof(int32_t);
		const int n_sz = -sizeof(int32_t);
		const int rewind_sz = sz * (nch * (cfg->blk_in
			+ (cfg->num_of_subfilters - 1) * cfg->idm) - nch);
		const int nch_x_idm_sz = -nch * cfg->idm * sizeof(int32_t);
		const int taps_div_4 = cfg->subfilter_length >> 2;

	#if SRC_SHORT
		const size_t subfilter_size = cfg->subfilter_length * sizeof(int16_t);
	#else
		const size_t subfilter_size = cfg->subfilter_length * sizeof(int32_t);
	#endif

		for (n = 0; n < s->times; n++) {
			/* Input data to filter */
			m = blk_in_words;

			/* Setup circular buffer for FIR input data delay */
			AE_SETCBEGIN0(fir->fir_delay);
			AE_SETCEND0(fir_end);

			while (m > 0) {
				/* Number of words until circular wrap */
				n_wrap_buf = s->x_end_addr - s->x_rptr;
				n_min = (m < n_wrap_buf) ? m : n_wrap_buf;
				m -= n_min;
				for (i = 0; i < n_min; i++) {
					/* Load 32 bits sample to accumulator */
					q = AE_LQ32F_I((ae_q32s *)s->x_rptr++, 0);

					/* Store to circular buffer, advance pointer */
					AE_SQ32F_C(q, (ae_q32s *)fir->fir_wp, n_sz);
				}

				/* Check for wrap */
				src_circ_inc_wrap(&s->x_rptr, s->x_end_addr, s->x_size);
			}

			/* Do filter */
			cp = cfg->coefs; /* Reset to 1st coefficient */
			rp = (ae_q32s *)fir->fir_wp;

			/* Do circular modification to pointer rp by amount of
			 * rewind to data start. Loaded value q is discarded.
			 */
			AE_LQ32F_C(q, (ae_q32s *)rp, rewind_sz);

			/* Reset FIR write pointer and compute all polyphase
			 * sub-filters.
			 */
			wp = (ae_q32s *)fir->out_rp;
			for (i = 0; i < cfg->num_of_subfilters; i++) {
				fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch);
				wp += nch_x_odm;

				/* Step to the next sub-filter in bytes. The cast
				 * avoids arithmetic on a void pointer that is not
				 * defined by ISO C.
				 */
				cp = (const char *)cp + subfilter_size;
				src_circ_inc_wrap((int32_t **)&wp, out_delay_end,
					out_size);

				/* Circular advance pointer rp by number of
				 * channels x input delay multiplier. Loaded value q
				 * is discarded.
				 */
				AE_LQ32F_C(q, rp, nch_x_idm_sz);
			}

			/* Output */

			/* Setup circular buffer for SRC out delay access */
			AE_SETCBEGIN0(fir->out_delay);
			AE_SETCEND0(out_delay_end);
			m = blk_out_words;
			while (m > 0) {
				/* Number of words until output buffer wrap */
				n_wrap_buf = s->y_end_addr - s->y_wptr;
				n_min = (m < n_wrap_buf) ? m : n_wrap_buf;
				m -= n_min;
				for (i = 0; i < n_min; i++) {
					/* Circular load followed by linear store */
					AE_LQ32F_C(q, (ae_q32s *)fir->out_rp, sz);
					AE_SQ32F_I(q, (ae_q32s *)s->y_wptr, 0);
					s->y_wptr++;
				}
				/* Check wrap */
				src_circ_inc_wrap(&s->y_wptr, s->y_end_addr, s->y_size);
			}
		}
	}

	/* Run one polyphase SRC stage s->times times on 24 bit samples held in
	 * 32 bit words.
	 *
	 * Every pass copies nch * blk_in words from the input buffer at
	 * s->x_rptr into the FIR delay line with a left shift by 8, runs
	 * fir_filter() once per sub-filter to fill the output delay line, and
	 * then copies nch * num_of_subfilters words from the output delay line
	 * to the output buffer at s->y_wptr with a right shift by 8. The
	 * pointers s->x_rptr, s->y_wptr, fir_wp and out_rp of the state are
	 * advanced as a side effect.
	 */
	void src_polyphase_stage_cir_s24(struct src_stage_prm *s)
	{
		/* This function uses
		 *  1x 56 bit registers Q,
		 *  0x 48 bit registers P,
		 * 16x integers
		 *  7x address pointers,
		 */
		ae_q56s q;
		ae_q32s *rp;
		ae_q32s *wp;
		int i;
		int n;
		int m;
		int n_wrap_buf;
		int n_min;
		struct src_state *fir = s->state;
		struct src_stage *cfg = s->stage;
		int32_t *fir_end = &fir->fir_delay[fir->fir_delay_size];
		int32_t *out_delay_end = &fir->out_delay[fir->out_delay_size];
		const void *cp; /* Can be int32_t or int16_t */
		const size_t out_size = fir->out_delay_size * sizeof(int32_t);
		const int nch = s->nch;
		const int nch_x_odm = cfg->odm * nch;
		const int blk_in_words = nch * cfg->blk_in;
		const int blk_out_words = nch * cfg->num_of_subfilters;
		const int sz = sizeof(int32_t);
		const int n_sz = -sizeof(int32_t);
		const int rewind_sz = sz * (nch * (cfg->blk_in
			+ (cfg->num_of_subfilters - 1) * cfg->idm) - nch);
		const int nch_x_idm_sz = -nch * cfg->idm * sizeof(int32_t);
		const int taps_div_4 = cfg->subfilter_length >> 2;

	#if SRC_SHORT
		const size_t subfilter_size = cfg->subfilter_length * sizeof(int16_t);
	#else
		const size_t subfilter_size = cfg->subfilter_length * sizeof(int32_t);
	#endif

		for (n = 0; n < s->times; n++) {
			/* Input data to filter */
			m = blk_in_words;

			/* Setup circular buffer for FIR input data delay */
			AE_SETCBEGIN0(fir->fir_delay);
			AE_SETCEND0(fir_end);

			while (m > 0) {
				/* Number of words without circular wrap */
				n_wrap_buf = s->x_end_addr - s->x_rptr;
				n_min = (m < n_wrap_buf) ? m : n_wrap_buf;
				m -= n_min;
				for (i = 0; i < n_min; i++) {
					/* Load 32 bits sample to accumulator
					 * and left shift by 8, advance read
					 * pointer.
					 */
					q = AE_SLLIQ56(AE_LQ32F_I(
						(ae_q32s *)s->x_rptr++, 0), 8);

					/* Store to circular buffer, advance
					 * write pointer.
					 */
					AE_SQ32F_C(q, (ae_q32s *)fir->fir_wp, n_sz);
				}

				/* Check for wrap */
				src_circ_inc_wrap(&s->x_rptr, s->x_end_addr, s->x_size);
			}

			/* Do filter */
			cp = cfg->coefs; /* Reset to 1st coefficient */
			rp = (ae_q32s *)fir->fir_wp;

			/* Do circular modification to pointer rp by amount of
			 * rewind to data start. Loaded value q is discarded.
			 */
			AE_LQ32F_C(q, (ae_q32s *)rp, rewind_sz);

			/* Reset FIR output write pointer and compute all polyphase
			 * sub-filters.
			 */
			wp = (ae_q32s *)fir->out_rp;
			for (i = 0; i < cfg->num_of_subfilters; i++) {
				fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch);
				wp += nch_x_odm;

				/* Step to the next sub-filter in bytes. The cast
				 * avoids arithmetic on a void pointer that is not
				 * defined by ISO C.
				 */
				cp = (const char *)cp + subfilter_size;
				src_circ_inc_wrap((int32_t **)&wp, out_delay_end,
					out_size);

				/* Circular advance pointer rp by number of
				 * channels x input delay multiplier. Loaded value q
				 * is discarded.
				 */
				AE_LQ32F_C(q, rp, nch_x_idm_sz);
			}

			/* Output */

			/* Setup circular buffer for SRC out delay access */
			AE_SETCBEGIN0(fir->out_delay);
			AE_SETCEND0(out_delay_end);
			m = blk_out_words;
			while (m > 0) {
				/* Number of words until output buffer wrap */
				n_wrap_buf = s->y_end_addr - s->y_wptr;
				n_min = (m < n_wrap_buf) ? m : n_wrap_buf;
				m -= n_min;
				for (i = 0; i < n_min; i++) {
					/* Circular load for 32 bit sample,
					 * advance pointer.
					 */
					AE_LQ32F_C(q, (ae_q32s *)fir->out_rp, sz);

					/* Store value as shifted right by 8 for
					 * sign extended 24 bit value, advance pointer.
					 */
					AE_SQ32F_I(AE_SRAIQ56(q, 8),
						   (ae_q32s *)s->y_wptr, 0);
					s->y_wptr++;
				}
				/* Check wrap */
				src_circ_inc_wrap(&s->y_wptr, s->y_end_addr, s->y_size);
			}
		}
	}

	#endif