blob: d2dfab0d75960d0562efc7c998c70fb6b2a48683 [file] [log] [blame]
/*
* Copyright (c) 2016, Intel Corporation
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the Intel Corporation nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
*
*/
/* HiFi3 optimized code parts for SRC */
#include <stdint.h>
#include <sof/alloc.h>
#include <sof/audio/format.h>
#include <sof/math/numbers.h>
#include "src_config.h"
#include "src.h"
#if SRC_HIFI3
#include <xtensa/config/defs.h>
#include <xtensa/tie/xt_hifi3.h>
/* HiFi3 has
* 16x 64 bit registers in register file AE_DR
*/
#if SRC_SHORT /* 16 bit coefficients version */
/* Compute one output frame (nch samples) of a single polyphase sub-filter
* using 16 bit coefficients.
*
* rp: read position in the interleaved input delay line. The data is
* fetched with circular (_XC) loads, so the circular buffer registers
* must already describe the delay line; the callers in this file set
* them with AE_SETCBEGIN0()/AE_SETCEND0() before calling.
* cp: coefficients of this sub-filter, read in groups of four 16 bit
* values.
* wp0: location where the nch output samples are stored one after another
* with post-incremented stores. Wrapping of the output buffer is not
* handled here.
* taps_div_4: number of inner loop iterations; four taps are consumed per
* iteration.
* shift: number of right shifts applied to the 64 bit accumulator before
* rounding to 32 bits.
* nch: number of interleaved channels. The case nch == 2 has a dedicated
* path that computes both channels in a single pass over the data.
*/
static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0,
const int taps_div_4, const int shift, const int nch)
{
/* This function uses
* 6x 64 bit registers
* 3x integers
* 5x address pointers,
*/
ae_f64 a0;
ae_f64 a1;
/* NOTE(review): u is handed to AE_LA16X4_IP() below but is never
* assigned anywhere in this function. Confirm whether the alignment
* register needs to be primed (e.g. with AE_LA64_PP()) after each
* reset of coefp.
*/
ae_valign u;
ae_f16x4 coef4;
ae_f32x2 d0;
ae_f32x2 d1;
ae_f32x2 data2;
ae_f16x4 *coefp;
ae_f32x2 *dp;
ae_f32 *dp0;
ae_f32 *dp1;
int i;
int j;
ae_f32 *wp = wp0;
/* Byte distance between successive samples of the same channel */
const int inc = nch * sizeof(int32_t);
/* Stereo special case: both channels are computed in the same loop and
* the function returns without entering the generic per channel loop.
*/
if (nch == 2) {
/* Move data pointer back by one sample to start from right
* channel sample. Discard read value d0.
*/
dp = (ae_f32x2 *)rp;
AE_L32_XC(d0, (ae_f32 *)dp, -sizeof(ae_f32));
/* Reset coefficient pointer and clear accumulator */
coefp = (ae_f16x4 *)cp;
a0 = AE_ZERO64();
a1 = AE_ZERO64();
/* Compute FIR filter for current channel with four
* taps per every loop iteration. Four coefficients
* are loaded simultaneously. Data is read
* from interleaved buffer with stride of channels
* count.
*/
for (i = 0; i < taps_div_4; i++) {
/* Load four coefficients */
AE_LA16X4_IP(coef4, u, coefp);
/* Load two data samples from two channels */
AE_L32X2_XC(d0, dp, inc); /* r0, l0 */
AE_L32X2_XC(d1, dp, inc); /* r1, l1 */
/* Select to data2 sequential samples from a channel
* and then accumulate to a0 and a1
* data2_h * coef4_3 + data2_l * coef4_2.
* The data is 32 bits Q1.31 and coefficient 16 bits
* Q1.15. The accumulators are Q17.47.
*/
data2 = AE_SEL32_LL(d0, d1); /* l0, l1 */
AE_MULAAFD32X16_H3_L2(a0, data2, coef4);
data2 = AE_SEL32_HH(d0, d1); /* r0, r1 */
AE_MULAAFD32X16_H3_L2(a1, data2, coef4);
/* Load two data samples from two channels */
AE_L32X2_XC(d0, dp, inc); /* r2, l2 */
AE_L32X2_XC(d1, dp, inc); /* r3, l3 */
/* Accumulate
* data2_h * coef4_1 + data2_l * coef4_0.
*/
data2 = AE_SEL32_LL(d0, d1); /* l2, l3 */
AE_MULAAFD32X16_H1_L0(a0, data2, coef4);
data2 = AE_SEL32_HH(d0, d1); /* r2, r3 */
AE_MULAAFD32X16_H1_L0(a1, data2, coef4);
}
/* Scale FIR output with right shifts, round/saturate
* to Q1.31, and store 32 bit output. The a0 result is stored
* first, followed by the a1 result.
*/
AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp,
sizeof(int32_t));
AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a1, shift)), wp,
sizeof(int32_t));
return;
}
/* Generic path: filter the channels one at a time. dp1 holds the start
* position for the next channel, dp0 walks the taps of the current one.
*/
dp1 = (ae_f32 *)rp;
for (j = 0; j < nch; j++) {
/* Copy pointer and advance to next ch with dummy load */
dp0 = dp1;
AE_L32_XC(d0, dp1, -sizeof(ae_f32));
/* Reset coefficient pointer and clear accumulator */
coefp = (ae_f16x4 *)cp;
a0 = AE_ZERO64();
/* Compute FIR filter for current channel with four
* taps per every loop iteration. Data is read from
* interleaved buffer with stride of channels count.
*/
for (i = 0; i < taps_div_4; i++) {
/* Load four coefficients */
AE_LA16X4_IP(coef4, u, coefp);
/* Load two data samples, place to high and
* low of data2.
*/
AE_L32_XC(d0, dp0, inc);
AE_L32_XC(d1, dp0, inc);
data2 = AE_SEL32_LL(d0, d1);
/* Accumulate
* data2_h * coef4_3 + data2_l* coef4_2.
* The data is 32 bits Q1.31 and coefficient 16 bits
* Q1.15. The accumulator is Q17.47.
*/
AE_MULAAFD32X16_H3_L2(a0, data2, coef4);
/* Repeat with next two samples */
AE_L32_XC(d0, dp0, inc);
AE_L32_XC(d1, dp0, inc);
data2 = AE_SEL32_LL(d0, d1);
/* Accumulate
* data2_h * coef4_1 + data2_l * coef4_0.
*/
AE_MULAAFD32X16_H1_L0(a0, data2, coef4);
}
/* Scale FIR output with right shifts, round/saturate Q17.47
* to Q1.31, and store 32 bit output. Advance write
* pointer to next sample.
*/
AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp,
sizeof(int32_t));
}
}
#else /* 32bit coefficients version */
/* Compute one output frame (nch samples) of a single polyphase sub-filter
* using 32 bit coefficient words.
*
* rp: read position in the interleaved input delay line. The data is
* fetched with circular (_XC) loads, so the circular buffer registers
* must already describe the delay line; the callers in this file set
* them with AE_SETCBEGIN0()/AE_SETCEND0() before calling.
* cp: coefficients of this sub-filter, read as pairs of 32 bit words
* through an ae_f24x2 pointer.
* wp0: location where the nch output samples are stored one after another
* with post-incremented stores. Wrapping of the output buffer is not
* handled here.
* taps_div_4: number of inner loop iterations; four taps are consumed per
* iteration.
* shift: number of right shifts applied to the 64 bit accumulator before
* rounding to 32 bits.
* nch: number of interleaved channels. The case nch == 2 has a dedicated
* path that computes both channels in a single pass over the data.
*/
static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0,
const int taps_div_4, const int shift, const int nch)
{
/* This function uses
* 6x 64 bit registers
* 3x integers
* 5x address pointers,
*/
ae_f64 a0;
ae_f64 a1;
ae_f24x2 data2 = AE_ZERO24();
ae_f24x2 coef2 = AE_ZERO24();
ae_f24x2 d0 = AE_ZERO24();
ae_f24x2 d1 = AE_ZERO24();
ae_f24x2 *coefp;
ae_f24x2 *dp;
ae_f24 *dp1;
ae_f24 *dp0;
int i;
int j;
ae_f32 *wp = wp0;
/* Byte distance between successive samples of the same channel */
const int inc = nch * sizeof(int32_t);
/* Stereo special case: both channels are computed in the same loop and
* the function returns without entering the generic per channel loop.
*/
if (nch == 2) {
/* Move data pointer back by one sample to start from right
* channel sample. Discard read value d0.
*/
dp = (ae_f24x2 *)rp;
AE_L32F24_XC(d0, (ae_f24 *)dp, -sizeof(ae_f24));
/* Reset coefficient pointer and clear accumulator */
coefp = (ae_f24x2 *)cp;
a0 = AE_ZERO64();
a1 = AE_ZERO64();
/* Compute FIR filter for current channel with four
* taps per every loop iteration. Two coefficients
* are loaded simultaneously. Data is read
* from interleaved buffer with stride of channels
* count.
*/
for (i = 0; i < taps_div_4; i++) {
/* Load two coefficients. Coef2_h contains tap *coefp
* and coef2_l contains the next tap.
*/
/* TODO: Ensure coefficients are 64 bits aligned */
AE_L32X2F24_IP(coef2, coefp, sizeof(ae_f24x2));
/* Load two data samples from two channels */
AE_L32X2F24_XC(d0, dp, inc); /* r0, l0 */
AE_L32X2F24_XC(d1, dp, inc); /* r1, l1 */
/* Select to data2 first the low halves of d0 and d1
* (accumulated to a0) and then the high halves
* (accumulated to a1). Each accumulation is
* data2_h * coef2_h + data2_l * coef2_l. The Q1.31
* data and Q1.15 coefficients are used as 24 bits as
* Q1.23 values.
* NOTE(review): this is the 32 bit coefficient build,
* so the "Q1.15 coefficients" wording looks inherited
* from the 16 bit version - confirm the intended format.
*/
data2 = AE_SELP24_LL(d0, d1);
AE_MULAAFP24S_HH_LL(a0, data2, coef2);
data2 = AE_SELP24_HH(d0, d1);
AE_MULAAFP24S_HH_LL(a1, data2, coef2);
/* Repeat for next two taps */
AE_L32X2F24_IP(coef2, coefp, sizeof(ae_f24x2));
AE_L32X2F24_XC(d0, dp, inc); /* r2, l2 */
AE_L32X2F24_XC(d1, dp, inc); /* r3, l3 */
data2 = AE_SELP24_LL(d0, d1);
AE_MULAAFP24S_HH_LL(a0, data2, coef2);
data2 = AE_SELP24_HH(d0, d1);
AE_MULAAFP24S_HH_LL(a1, data2, coef2);
}
/* Scale FIR output with right shifts, round/saturate
* to Q1.31, and store 32 bit output. The a0 result is stored
* first, followed by the a1 result.
*/
AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp,
sizeof(int32_t));
AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a1, shift)), wp,
sizeof(int32_t));
return;
}
/* Generic path: filter the channels one at a time. dp1 holds the start
* position for the next channel, dp0 walks the taps of the current one.
* NOTE(review): this path multiplies with AE_MULAAFD24_HH_LL() and
* loads coefficients with a plain dereference, while the stereo path
* above uses AE_MULAAFP24S_HH_LL() and AE_L32X2F24_IP(). Confirm that
* both paths are meant to produce identical results.
*/
dp1 = (ae_f24 *)rp;
for (j = 0; j < nch; j++) {
/* Copy pointer and advance to next ch with dummy load */
dp0 = dp1;
AE_L32F24_XC(data2, dp1, -sizeof(ae_f24));
/* Reset coefficient pointer and clear accumulator */
coefp = (ae_f24x2 *)cp;
a0 = AE_ZERO64();
/* Compute FIR filter for current channel with four
* taps per every loop iteration. Data is read from
* interleaved buffer with stride of channels count.
*/
for (i = 0; i < taps_div_4; i++) {
/* Load two coefficients */
coef2 = *coefp++;
/* Load two data samples, place to high and
* low of data2.
*/
AE_L32F24_XC(d0, dp0, inc);
AE_L32F24_XC(d1, dp0, inc);
data2 = AE_SELP24_LL(d0, d1);
/* Accumulate to a0 data2_h * coef2_h +
* data2_l*coef2_l. The Q1.31 bit data is used
* as Q1.23 from MSB side bits of the 32 bit
* word. The accumulator a0 is Q17.47.
*/
AE_MULAAFD24_HH_LL(a0, data2, coef2);
/* Repeat the same for next two filter taps */
coef2 = *coefp++;
AE_L32F24_XC(d0, dp0, inc);
AE_L32F24_XC(d1, dp0, inc);
data2 = AE_SELP24_LL(d0, d1);
AE_MULAAFD24_HH_LL(a0, data2, coef2);
}
/* Scale FIR output with right shifts, round/saturate Q17.47
* to Q1.31, and store 32 bit output. Advance write
* pointer to next sample.
*/
AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp,
sizeof(int32_t));
}
}
#endif /* 32bit coefficients version */
/* Run one polyphase SRC stage on 32 bit samples.
 *
 * The block sequence below is repeated s->times times:
 *  1. nch * blk_in words are copied from the source buffer at s->x_rptr
 *     into the FIR delay line at fir->fir_wp. The delay line is written
 *     with a circular store using a negative step.
 *  2. fir_filter() is called once per sub-filter (cfg->num_of_subfilters
 *     times). Each call stores nch samples into the output delay line
 *     starting from fir->out_rp.
 *  3. nch * num_of_subfilters words are copied from the output delay line
 *     at fir->out_rp to the sink buffer at s->y_wptr.
 *
 * s: stage parameters. The function reads s->state, s->stage, s->nch,
 *    s->times and the x and y buffer descriptors, and it advances
 *    s->x_rptr, s->y_wptr, fir->fir_wp and fir->out_rp.
 *
 * The HiFi3 circular buffer 0 registers are reprogrammed here, first for
 * the FIR delay line and then for the output delay line.
 */
void src_polyphase_stage_cir(struct src_stage_prm *s)
{
	/* This function uses
	 * 1x 64 bit registers
	 * 16x integers
	 * 7x address pointers,
	 */
	ae_int32x2 q = AE_ZERO32();
	ae_f32 *rp;
	ae_f32 *wp;
	int i;
	int n;
	int m;
	int n_wrap_buf;
	int n_min;
	struct src_state *fir = s->state;
	struct src_stage *cfg = s->stage;
	int32_t *fir_end = &fir->fir_delay[fir->fir_delay_size];
	int32_t *out_delay_end = &fir->out_delay[fir->out_delay_size];
	const void *cp; /* Can be int32_t or int16_t */
	const size_t out_size = fir->out_delay_size * sizeof(int32_t);
	const int nch = s->nch;
	const int nch_x_odm = cfg->odm * nch;
	const int blk_in_words = nch * cfg->blk_in;
	const int blk_out_words = nch * cfg->num_of_subfilters;
	const int sz = sizeof(int32_t);
	/* Negative byte steps are computed in signed arithmetic instead of
	 * negating an unsigned sizeof() result.
	 */
	const int n_sz = -sz;
	const int rewind_sz = sz * (nch * (cfg->blk_in
		+ (cfg->num_of_subfilters - 1) * cfg->idm) - nch);
	const int nch_x_idm_sz = -nch * cfg->idm * sz;
	const int taps_div_4 = cfg->subfilter_length >> 2;

#if SRC_SHORT
	const size_t subfilter_size = cfg->subfilter_length * sizeof(int16_t);
#else
	const size_t subfilter_size = cfg->subfilter_length * sizeof(int32_t);
#endif

	for (n = 0; n < s->times; n++) {
		/* Input data to filter */
		m = blk_in_words;

		/* Setup circular buffer for FIR input data delay */
		AE_SETCBEGIN0(fir->fir_delay);
		AE_SETCEND0(fir_end);

		while (m > 0) {
			/* Number of words until circular wrap */
			n_wrap_buf = s->x_end_addr - s->x_rptr;
			n_min = (m < n_wrap_buf) ? m : n_wrap_buf;
			m -= n_min;
			for (i = 0; i < n_min; i++) {
				/* Load 32 bits sample to accumulator,
				 * advance pointer.
				 */
				AE_L32_XP(q, (ae_int32 *)s->x_rptr, sz);

				/* Store to circular buffer, advance pointer */
				AE_S32_L_XC(q, (ae_int32 *)fir->fir_wp, n_sz);
			}

			/* Check for wrap */
			src_circ_inc_wrap(&s->x_rptr, s->x_end_addr, s->x_size);
		}

		/* Do filter */
		cp = cfg->coefs; /* Reset to 1st coefficient */
		rp = (ae_f32 *)fir->fir_wp;

		/* Do circular modification to pointer rp by amount of
		 * rewind to data start. Loaded value q is discarded.
		 */
		AE_L32_XC(q, rp, rewind_sz);

		/* Reset FIR write pointer and compute all polyphase
		 * sub-filters.
		 */
		wp = (ae_f32 *)fir->out_rp;
		for (i = 0; i < cfg->num_of_subfilters; i++) {
			fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch);
			wp += nch_x_odm;

			/* Step to the next sub-filter in bytes. The pointer
			 * is advanced through a char pointer because
			 * arithmetic on void pointers is not standard C.
			 */
			cp = (const char *)cp + subfilter_size;

			/* The circular registers still describe the FIR
			 * delay line here, so the output write pointer is
			 * wrapped in software.
			 */
			src_circ_inc_wrap((int32_t **)&wp, out_delay_end,
					  out_size);

			/* Circular advance pointer rp by number of
			 * channels x input delay multiplier. Loaded value q
			 * is discarded.
			 */
			AE_L32_XC(q, rp, nch_x_idm_sz);
		}

		/* Output */

		/* Setup circular buffer for SRC out delay access */
		AE_SETCBEGIN0(fir->out_delay);
		AE_SETCEND0(out_delay_end);
		m = blk_out_words;
		while (m > 0) {
			/* Number of words until the sink buffer wraps */
			n_wrap_buf = s->y_end_addr - s->y_wptr;
			n_min = (m < n_wrap_buf) ? m : n_wrap_buf;
			m -= n_min;
			for (i = 0; i < n_min; i++) {
				/* Circular load followed by linear store,
				 * advance read and write pointers.
				 */
				AE_L32_XC(q, (ae_int32 *)fir->out_rp, sz);
				AE_S32_L_XP(q, (ae_int32 *)s->y_wptr, sz);
			}

			/* Check wrap */
			src_circ_inc_wrap(&s->y_wptr, s->y_end_addr, s->y_size);
		}
	}
}
/* Run one polyphase SRC stage on 24 bit samples carried in 32 bit words.
 *
 * The block sequence below is repeated s->times times:
 *  1. nch * blk_in words are read from the source buffer at s->x_rptr,
 *     shifted left by 8 bits and stored into the FIR delay line at
 *     fir->fir_wp. The delay line is written with a circular store using
 *     a negative step.
 *  2. fir_filter() is called once per sub-filter (cfg->num_of_subfilters
 *     times). Each call stores nch samples into the output delay line
 *     starting from fir->out_rp.
 *  3. nch * num_of_subfilters words are read from the output delay line
 *     at fir->out_rp, shifted right arithmetically by 8 bits and stored
 *     to the sink buffer at s->y_wptr.
 *
 * s: stage parameters. The function reads s->state, s->stage, s->nch,
 *    s->times and the x and y buffer descriptors, and it advances
 *    s->x_rptr, s->y_wptr, fir->fir_wp and fir->out_rp.
 *
 * The HiFi3 circular buffer 0 registers are reprogrammed here, first for
 * the FIR delay line and then for the output delay line.
 */
void src_polyphase_stage_cir_s24(struct src_stage_prm *s)
{
	/* This function uses
	 * 1x 64 bit registers
	 * 16x integers
	 * 7x address pointers,
	 */
	ae_int32x2 q = AE_ZERO32();
	ae_f32 *rp;
	ae_f32 *wp;
	int i;
	int n;
	int m;
	int n_wrap_buf;
	int n_min;
	struct src_state *fir = s->state;
	struct src_stage *cfg = s->stage;
	int32_t *fir_end = &fir->fir_delay[fir->fir_delay_size];
	int32_t *out_delay_end = &fir->out_delay[fir->out_delay_size];
	const void *cp; /* Can be int32_t or int16_t */
	const size_t out_size = fir->out_delay_size * sizeof(int32_t);
	const int nch = s->nch;
	const int nch_x_odm = cfg->odm * nch;
	const int blk_in_words = nch * cfg->blk_in;
	const int blk_out_words = nch * cfg->num_of_subfilters;
	const int sz = sizeof(int32_t);
	/* Negative byte steps are computed in signed arithmetic instead of
	 * negating an unsigned sizeof() result.
	 */
	const int n_sz = -sz;
	const int rewind_sz = sz * (nch * (cfg->blk_in
		+ (cfg->num_of_subfilters - 1) * cfg->idm) - nch);
	const int nch_x_idm_sz = -nch * cfg->idm * sz;
	const int taps_div_4 = cfg->subfilter_length >> 2;

#if SRC_SHORT
	const size_t subfilter_size = cfg->subfilter_length * sizeof(int16_t);
#else
	const size_t subfilter_size = cfg->subfilter_length * sizeof(int32_t);
#endif

	for (n = 0; n < s->times; n++) {
		/* Input data */
		m = blk_in_words;

		/* Setup circular buffer for FIR input data delay */
		AE_SETCBEGIN0(fir->fir_delay);
		AE_SETCEND0(fir_end);

		while (m > 0) {
			/* Number of words without circular wrap */
			n_wrap_buf = s->x_end_addr - s->x_rptr;
			n_min = (m < n_wrap_buf) ? m : n_wrap_buf;
			m -= n_min;
			for (i = 0; i < n_min; i++) {
				/* Load 32 bits sample to accumulator
				 * and left shift by 8, advance read
				 * pointer.
				 */
				AE_L32_XP(q, (ae_int32 *)s->x_rptr, sz);
				AE_S32_L_XC(AE_SLAI32(q, 8),
					    (ae_int32 *)fir->fir_wp, n_sz);
			}

			/* Check for wrap */
			src_circ_inc_wrap(&s->x_rptr, s->x_end_addr, s->x_size);
		}

		/* Do filter */
		cp = cfg->coefs; /* Reset to 1st coefficient */
		rp = (ae_f32 *)fir->fir_wp;

		/* Do circular modification to pointer rp by amount of
		 * rewind to data start. Loaded value q is discarded.
		 */
		AE_L32_XC(q, rp, rewind_sz);

		/* Reset FIR output write pointer and compute all polyphase
		 * sub-filters.
		 */
		wp = (ae_f32 *)fir->out_rp;
		for (i = 0; i < cfg->num_of_subfilters; i++) {
			fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch);
			wp += nch_x_odm;

			/* Step to the next sub-filter in bytes. The pointer
			 * is advanced through a char pointer because
			 * arithmetic on void pointers is not standard C.
			 */
			cp = (const char *)cp + subfilter_size;

			/* The circular registers still describe the FIR
			 * delay line here, so the output write pointer is
			 * wrapped in software.
			 */
			src_circ_inc_wrap((int32_t **)&wp, out_delay_end,
					  out_size);

			/* Circular advance pointer rp by number of
			 * channels x input delay multiplier. Loaded value q
			 * is discarded.
			 */
			AE_L32_XC(q, rp, nch_x_idm_sz);
		}

		/* Output */

		/* Setup circular buffer for SRC out delay access */
		AE_SETCBEGIN0(fir->out_delay);
		AE_SETCEND0(out_delay_end);
		m = blk_out_words;
		while (m > 0) {
			/* Number of words until the sink buffer wraps */
			n_wrap_buf = s->y_end_addr - s->y_wptr;
			n_min = (m < n_wrap_buf) ? m : n_wrap_buf;
			m -= n_min;
			for (i = 0; i < n_min; i++) {
				/* Circular load for 32 bit sample,
				 * advance read pointer.
				 */
				AE_L32_XC(q, (ae_int32 *)fir->out_rp, sz);

				/* Store value as shifted right by 8
				 * for sign extended 24 bit value,
				 * advance write pointer.
				 */
				AE_S32_L_XP(AE_SRAI32(q, 8),
					    (ae_int32 *)s->y_wptr, sz);
			}

			/* Check wrap */
			src_circ_inc_wrap(&s->y_wptr, s->y_end_addr, s->y_size);
		}
	}
}
#endif