| /* |
| * Copyright (c) 2016, Intel Corporation |
| * All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * * Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * * Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * * Neither the name of the Intel Corporation nor the |
| * names of its contributors may be used to endorse or promote products |
| * derived from this software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| * POSSIBILITY OF SUCH DAMAGE. |
| * |
| * Author: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com> |
| * |
| */ |
| |
| /* HiFi3 optimized code parts for SRC */ |
| |
| #include <stdint.h> |
| #include <sof/alloc.h> |
| #include <sof/audio/format.h> |
| #include <sof/math/numbers.h> |
| |
| #include "src_config.h" |
| #include "src.h" |
| |
| #if SRC_HIFI3 |
| |
| #include <xtensa/config/defs.h> |
| #include <xtensa/tie/xt_hifi3.h> |
| |
| /* HiFi3 has |
| * 16x 64 bit registers in register file AE_DR |
| */ |
| |
| #if SRC_SHORT /* 16 bit coefficients version */ |
| |
| static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0, |
| const int taps_div_4, const int shift, const int nch) |
| { |
| /* This function uses |
| * 6x 64 bit registers |
| * 3x integers |
| * 5x address pointers, |
| */ |
| ae_f64 a0; |
| ae_f64 a1; |
| ae_valign u; |
| ae_f16x4 coef4; |
| ae_f32x2 d0; |
| ae_f32x2 d1; |
| ae_f32x2 data2; |
| ae_f16x4 *coefp; |
| ae_f32x2 *dp; |
| ae_f32 *dp0; |
| ae_f32 *dp1; |
| int i; |
| int j; |
| ae_f32 *wp = wp0; |
| const int inc = nch * sizeof(int32_t); |
| |
| if (nch == 2) { |
| /* Move data pointer back by one sample to start from right |
| * channel sample. Discard read value p0. |
| */ |
| dp = (ae_f32x2 *)rp; |
| AE_L32_XC(d0, (ae_f32 *)dp, -sizeof(ae_f32)); |
| |
| /* Reset coefficient pointer and clear accumulator */ |
| coefp = (ae_f16x4 *)cp; |
| a0 = AE_ZERO64(); |
| a1 = AE_ZERO64(); |
| |
| /* Compute FIR filter for current channel with four |
| * taps per every loop iteration. Four coefficients |
| * are loaded simultaneously. Data is read |
| * from interleaved buffer with stride of channels |
| * count. |
| */ |
| for (i = 0; i < taps_div_4; i++) { |
| /* Load four coefficients */ |
| AE_LA16X4_IP(coef4, u, coefp); |
| |
| /* Load two data samples from two channels */ |
| AE_L32X2_XC(d0, dp, inc); /* r0, l0 */ |
| AE_L32X2_XC(d1, dp, inc); /* r1, l1 */ |
| |
| /* Select to data2 sequential samples from a channel |
| * and then accumulate to a0 and a1 |
| * data2_h * coef4_3 + data2_l * coef4_2. |
| * The data is 32 bits Q1.31 and coefficient 16 bits |
| * Q1.15. The accumulators are Q17.47. |
| */ |
| data2 = AE_SEL32_LL(d0, d1); /* l0, l1 */ |
| AE_MULAAFD32X16_H3_L2(a0, data2, coef4); |
| data2 = AE_SEL32_HH(d0, d1); /* r0, r1 */ |
| AE_MULAAFD32X16_H3_L2(a1, data2, coef4); |
| |
| /* Load two data samples from two channels */ |
| AE_L32X2_XC(d0, dp, inc); /* r2, l2 */ |
| AE_L32X2_XC(d1, dp, inc); /* r3, l3 */ |
| |
| /* Accumulate |
| * data2_h * coef4_1 + data2_l * coef4_0. |
| */ |
| data2 = AE_SEL32_LL(d0, d1); /* l2, l3 */ |
| AE_MULAAFD32X16_H1_L0(a0, data2, coef4); |
| data2 = AE_SEL32_HH(d0, d1); /* r2, r3 */ |
| AE_MULAAFD32X16_H1_L0(a1, data2, coef4); |
| } |
| |
| /* Scale FIR output with right shifts, round/saturate |
| * to Q1.31, and store 32 bit output. |
| */ |
| AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp, |
| sizeof(int32_t)); |
| AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a1, shift)), wp, |
| sizeof(int32_t)); |
| |
| return; |
| } |
| |
| dp1 = (ae_f32 *)rp; |
| for (j = 0; j < nch; j++) { |
| /* Copy pointer and advance to next ch with dummy load */ |
| dp0 = dp1; |
| AE_L32_XC(d0, dp1, -sizeof(ae_f32)); |
| |
| /* Reset coefficient pointer and clear accumulator */ |
| coefp = (ae_f16x4 *)cp; |
| a0 = AE_ZERO64(); |
| |
| /* Compute FIR filter for current channel with four |
| * taps per every loop iteration. Data is read from |
| * interleaved buffer with stride of channels count. |
| */ |
| for (i = 0; i < taps_div_4; i++) { |
| /* Load four coefficients */ |
| AE_LA16X4_IP(coef4, u, coefp); |
| |
| /* Load two data samples, place to high and |
| * low of data2. |
| */ |
| AE_L32_XC(d0, dp0, inc); |
| AE_L32_XC(d1, dp0, inc); |
| data2 = AE_SEL32_LL(d0, d1); |
| |
| /* Accumulate |
| * data2_h * coef4_3 + data2_l* coef4_2. |
| * The data is 32 bits Q1.31 and coefficient 16 bits |
| * Q1.15. The accumulator is Q17.47. |
| */ |
| AE_MULAAFD32X16_H3_L2(a0, data2, coef4); |
| |
| /* Repeat with next two samples */ |
| AE_L32_XC(d0, dp0, inc); |
| AE_L32_XC(d1, dp0, inc); |
| data2 = AE_SEL32_LL(d0, d1); |
| |
| /* Accumulate |
| * data2_h * coef4_1 + data2_l * coef4_0. |
| */ |
| AE_MULAAFD32X16_H1_L0(a0, data2, coef4); |
| } |
| |
| /* Scale FIR output with right shifts, round/saturate Q17.47 |
| * to Q1.31, and store 32 bit output. Advance write |
| * pointer to next sample. |
| */ |
| AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp, |
| sizeof(int32_t)); |
| } |
| } |
| |
| #else /* 32bit coefficients version */ |
| |
| static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0, |
| const int taps_div_4, const int shift, const int nch) |
| { |
| /* This function uses |
| * 6x 64 bit registers |
| * 3x integers |
| * 5x address pointers, |
| */ |
| ae_f64 a0; |
| ae_f64 a1; |
| ae_f24x2 data2 = AE_ZERO24(); |
| ae_f24x2 coef2 = AE_ZERO24(); |
| ae_f24x2 d0 = AE_ZERO24(); |
| ae_f24x2 d1 = AE_ZERO24(); |
| ae_f24x2 *coefp; |
| ae_f24x2 *dp; |
| ae_f24 *dp1; |
| ae_f24 *dp0; |
| int i; |
| int j; |
| ae_f32 *wp = wp0; |
| const int inc = nch * sizeof(int32_t); |
| |
| if (nch == 2) { |
| /* Move data pointer back by one sample to start from right |
| * channel sample. Discard read value p0. |
| */ |
| dp = (ae_f24x2 *)rp; |
| AE_L32F24_XC(d0, (ae_f24 *)dp, -sizeof(ae_f24)); |
| |
| /* Reset coefficient pointer and clear accumulator */ |
| coefp = (ae_f24x2 *)cp; |
| a0 = AE_ZERO64(); |
| a1 = AE_ZERO64(); |
| |
| /* Compute FIR filter for current channel with four |
| * taps per every loop iteration. Two coefficients |
| * are loaded simultaneously. Data is read |
| * from interleaved buffer with stride of channels |
| * count. |
| */ |
| for (i = 0; i < taps_div_4; i++) { |
| /* Load two coefficients. Coef2_h contains tap *coefp |
| * and coef2_l contains the next tap. |
| */ |
| /* TODO: Ensure coefficients are 64 bits aligned */ |
| AE_L32X2F24_IP(coef2, coefp, sizeof(ae_f24x2)); |
| |
| /* Load two data samples from two channels */ |
| AE_L32X2F24_XC(d0, dp, inc); /* r0, l0 */ |
| AE_L32X2F24_XC(d1, dp, inc); /* r1, l1 */ |
| |
| /* Select to d0 successive left channel samples, to d1 |
| * successive right channel samples. Then Accumulate |
| * to a0 and a1 |
| * data2_h * coef2_h + data2_l * coef2_l. The Q1.31 |
| * data and Q1.15 coefficients are used as 24 bits as |
| * Q1.23 values. |
| */ |
| data2 = AE_SELP24_LL(d0, d1); |
| AE_MULAAFP24S_HH_LL(a0, data2, coef2); |
| data2 = AE_SELP24_HH(d0, d1); |
| AE_MULAAFP24S_HH_LL(a1, data2, coef2); |
| |
| /* Repeat for next two taps */ |
| AE_L32X2F24_IP(coef2, coefp, sizeof(ae_f24x2)); |
| AE_L32X2F24_XC(d0, dp, inc); /* r2, l2 */ |
| AE_L32X2F24_XC(d1, dp, inc); /* r3, l3 */ |
| data2 = AE_SELP24_LL(d0, d1); |
| AE_MULAAFP24S_HH_LL(a0, data2, coef2); |
| data2 = AE_SELP24_HH(d0, d1); |
| AE_MULAAFP24S_HH_LL(a1, data2, coef2); |
| } |
| |
| /* Scale FIR output with right shifts, round/saturate |
| * to Q1.31, and store 32 bit output. |
| */ |
| AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp, |
| sizeof(int32_t)); |
| AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a1, shift)), wp, |
| sizeof(int32_t)); |
| |
| return; |
| } |
| |
| dp1 = (ae_f24 *)rp; |
| for (j = 0; j < nch; j++) { |
| /* Copy pointer and advance to next ch with dummy load */ |
| dp0 = dp1; |
| AE_L32F24_XC(data2, dp1, -sizeof(ae_f24)); |
| |
| /* Reset coefficient pointer and clear accumulator */ |
| coefp = (ae_f24x2 *)cp; |
| a0 = AE_ZERO64(); |
| |
| /* Compute FIR filter for current channel with four |
| * taps per every loop iteration. Data is read from |
| * interleaved buffer with stride of channels count. |
| */ |
| for (i = 0; i < taps_div_4; i++) { |
| /* Load two coefficients */ |
| coef2 = *coefp++; |
| |
| /* Load two data samples, place to high and |
| * low of data2. |
| */ |
| AE_L32F24_XC(d0, dp0, inc); |
| AE_L32F24_XC(d1, dp0, inc); |
| data2 = AE_SELP24_LL(d0, d1); |
| |
| /* Accumulate to data2_h * coef2_h + |
| * data2_l*coef2_l. The Q1.31 bit data is used |
| * as Q1.23 from MSB side bits of the 32 bit |
| * word. The accumulator m is Q17.47. |
| */ |
| AE_MULAAFD24_HH_LL(a0, data2, coef2); |
| |
| /* Repeat the same for next two filter taps */ |
| coef2 = *coefp++; |
| AE_L32F24_XC(d0, dp0, inc); |
| AE_L32F24_XC(d1, dp0, inc); |
| data2 = AE_SELP24_LL(d0, d1); |
| AE_MULAAFD24_HH_LL(a0, data2, coef2); |
| } |
| |
| /* Scale FIR output with right shifts, round/saturate Q17.47 |
| * to Q1.31, and store 32 bit output. Advance write |
| * pointer to next sample. |
| */ |
| AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp, |
| sizeof(int32_t)); |
| } |
| } |
| |
| #endif /* 32bit coefficients version */ |
| |
| void src_polyphase_stage_cir(struct src_stage_prm *s) |
| { |
| /* This function uses |
| * 1x 64 bit registers |
| * 16x integers |
| * 7x address pointers, |
| */ |
| ae_int32x2 q = AE_ZERO32(); |
| ae_f32 *rp; |
| ae_f32 *wp; |
| int i; |
| int n; |
| int m; |
| int n_wrap_buf; |
| int n_min; |
| struct src_state *fir = s->state; |
| struct src_stage *cfg = s->stage; |
| int32_t *fir_end = &fir->fir_delay[fir->fir_delay_size]; |
| int32_t *out_delay_end = &fir->out_delay[fir->out_delay_size]; |
| const void *cp; /* Can be int32_t or int16_t */ |
| const size_t out_size = fir->out_delay_size * sizeof(int32_t); |
| const int nch = s->nch; |
| const int nch_x_odm = cfg->odm * nch; |
| const int blk_in_words = nch * cfg->blk_in; |
| const int blk_out_words = nch * cfg->num_of_subfilters; |
| const int sz = sizeof(int32_t); |
| const int n_sz = -sizeof(int32_t); |
| const int rewind_sz = sz * (nch * (cfg->blk_in |
| + (cfg->num_of_subfilters - 1) * cfg->idm) - nch); |
| const int nch_x_idm_sz = -nch * cfg->idm * sizeof(int32_t); |
| const int taps_div_4 = cfg->subfilter_length >> 2; |
| |
| #if SRC_SHORT |
| const size_t subfilter_size = cfg->subfilter_length * sizeof(int16_t); |
| #else |
| const size_t subfilter_size = cfg->subfilter_length * sizeof(int32_t); |
| #endif |
| |
| for (n = 0; n < s->times; n++) { |
| /* Input data to filter */ |
| m = blk_in_words; |
| |
| /* Setup circular buffer for FIR input data delay */ |
| AE_SETCBEGIN0(fir->fir_delay); |
| AE_SETCEND0(fir_end); |
| |
| while (m > 0) { |
| /* Number of words until circular wrap */ |
| n_wrap_buf = s->x_end_addr - s->x_rptr; |
| n_min = (m < n_wrap_buf) ? m : n_wrap_buf; |
| m -= n_min; |
| for (i = 0; i < n_min; i++) { |
| /* Load 32 bits sample to accumulator, |
| * advance pointer. |
| */ |
| AE_L32_XP(q, (ae_int32 *)s->x_rptr, sz); |
| |
| /* Store to circular buffer, advance pointer */ |
| AE_S32_L_XC(q, (ae_int32 *)fir->fir_wp, n_sz); |
| } |
| |
| /* Check for wrap */ |
| src_circ_inc_wrap(&s->x_rptr, s->x_end_addr, s->x_size); |
| } |
| |
| /* Do filter */ |
| cp = cfg->coefs; /* Reset to 1st coefficient */ |
| rp = (ae_f32 *)fir->fir_wp; |
| |
| /* Do circular modification to pointer rp by amount of |
| * rewind to to data start. Loaded value q is discarded. |
| */ |
| AE_L32_XC(q, rp, rewind_sz); |
| |
| /* Reset FIR write pointer and compute all polyphase |
| * sub-filters. |
| */ |
| wp = (ae_f32 *)fir->out_rp; |
| for (i = 0; i < cfg->num_of_subfilters; i++) { |
| fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch); |
| wp += nch_x_odm; |
| cp += subfilter_size; |
| src_circ_inc_wrap((int32_t **)&wp, out_delay_end, |
| out_size); |
| |
| /* Circular advance pointer rp by number of |
| * channels x input delay multiplier. Loaded value q |
| * is discarded. |
| */ |
| AE_L32_XC(q, rp, nch_x_idm_sz); |
| } |
| |
| /* Output */ |
| |
| /* Setup circular buffer for SRC out delay access */ |
| AE_SETCBEGIN0(fir->out_delay); |
| AE_SETCEND0(out_delay_end); |
| m = blk_out_words; |
| while (m > 0) { |
| n_wrap_buf = s->y_end_addr - s->y_wptr; |
| n_min = (m < n_wrap_buf) ? m : n_wrap_buf; |
| m -= n_min; |
| for (i = 0; i < n_min; i++) { |
| /* Circular load followed by linear store, |
| * advance read and write pointers. |
| */ |
| AE_L32_XC(q, (ae_int32 *)fir->out_rp, sz); |
| AE_S32_L_XP(q, (ae_int32 *)s->y_wptr, sz); |
| } |
| |
| /* Check wrap */ |
| src_circ_inc_wrap(&s->y_wptr, s->y_end_addr, s->y_size); |
| } |
| } |
| } |
| |
| void src_polyphase_stage_cir_s24(struct src_stage_prm *s) |
| { |
| /* This function uses |
| * 1x 64 bit registers |
| * 16x integers |
| * 7x address pointers, |
| */ |
| ae_int32x2 q = AE_ZERO32(); |
| ae_f32 *rp; |
| ae_f32 *wp; |
| int i; |
| int n; |
| int m; |
| int n_wrap_buf; |
| int n_min; |
| |
| struct src_state *fir = s->state; |
| struct src_stage *cfg = s->stage; |
| int32_t *fir_end = &fir->fir_delay[fir->fir_delay_size]; |
| int32_t *out_delay_end = &fir->out_delay[fir->out_delay_size]; |
| const void *cp; /* Can be int32_t or int16_t */ |
| const size_t out_size = fir->out_delay_size * sizeof(int32_t); |
| const int nch = s->nch; |
| const int nch_x_odm = cfg->odm * nch; |
| const int blk_in_words = nch * cfg->blk_in; |
| const int blk_out_words = nch * cfg->num_of_subfilters; |
| const int sz = sizeof(int32_t); |
| const int n_sz = -sizeof(int32_t); |
| const int rewind_sz = sz * (nch * (cfg->blk_in |
| + (cfg->num_of_subfilters - 1) * cfg->idm) - nch); |
| const int nch_x_idm_sz = -nch * cfg->idm * sizeof(int32_t); |
| const int taps_div_4 = cfg->subfilter_length >> 2; |
| |
| #if SRC_SHORT |
| const size_t subfilter_size = cfg->subfilter_length * sizeof(int16_t); |
| #else |
| const size_t subfilter_size = cfg->subfilter_length * sizeof(int32_t); |
| #endif |
| |
| for (n = 0; n < s->times; n++) { |
| /* Input data */ |
| m = blk_in_words; |
| |
| /* Setup circular buffer for FIR input data delay */ |
| AE_SETCBEGIN0(fir->fir_delay); |
| AE_SETCEND0(fir_end); |
| |
| while (m > 0) { |
| /* Number of words without circular wrap */ |
| n_wrap_buf = s->x_end_addr - s->x_rptr; |
| n_min = (m < n_wrap_buf) ? m : n_wrap_buf; |
| m -= n_min; |
| for (i = 0; i < n_min; i++) { |
| /* Load 32 bits sample to accumulator |
| * and left shift by 8, advance read |
| * pointer. |
| */ |
| AE_L32_XP(q, (ae_int32 *)s->x_rptr, sz); |
| AE_S32_L_XC(AE_SLAI32(q, 8), |
| (ae_int32 *)fir->fir_wp, n_sz); |
| } |
| |
| /* Check for wrap */ |
| src_circ_inc_wrap(&s->x_rptr, s->x_end_addr, s->x_size); |
| } |
| |
| /* Do filter */ |
| cp = cfg->coefs; /* Reset to 1st coefficient */ |
| rp = (ae_f32 *)fir->fir_wp; |
| |
| /* Do circular modification to pointer rp by amount of |
| * rewind to to data start. Loaded value q is discarded. |
| */ |
| AE_L32_XC(q, rp, rewind_sz); |
| |
| /* Reset FIR output write pointer and compute all polyphase |
| * sub-filters. |
| */ |
| wp = (ae_f32 *)fir->out_rp; |
| for (i = 0; i < cfg->num_of_subfilters; i++) { |
| fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch); |
| wp += nch_x_odm; |
| cp += subfilter_size; |
| src_circ_inc_wrap((int32_t **)&wp, out_delay_end, |
| out_size); |
| |
| /* Circular advance pointer rp by number of |
| * channels x input delay multiplier. Loaded value q |
| * is discarded. |
| */ |
| AE_L32_XC(q, rp, nch_x_idm_sz); |
| } |
| |
| /* Output */ |
| |
| /* Setup circular buffer for SRC out delay access */ |
| AE_SETCBEGIN0(fir->out_delay); |
| AE_SETCEND0(out_delay_end); |
| m = blk_out_words; |
| while (m > 0) { |
| n_wrap_buf = s->y_end_addr - s->y_wptr; |
| n_min = (m < n_wrap_buf) ? m : n_wrap_buf; |
| m -= n_min; |
| for (i = 0; i < n_min; i++) { |
| /* Circular load for 32 bit sample, |
| * advance read pointer. |
| */ |
| AE_L32_XC(q, (ae_int32 *)fir->out_rp, sz); |
| |
| /* Store value as shifted right by 8 |
| * for sign extended 24 bit value, |
| * advance write pointer. |
| */ |
| AE_S32_L_XP(AE_SRAI32(q, 8), |
| (ae_int32 *)s->y_wptr, sz); |
| } |
| |
| /* Check wrap */ |
| src_circ_inc_wrap(&s->y_wptr, s->y_end_addr, s->y_size); |
| } |
| } |
| } |
| |
| #endif |