From f7762727c09c42489cb88668294859e40e6407e7 Mon Sep 17 00:00:00 2001 From: Serhiy Katsyuba Date: Thu, 23 Jan 2025 19:44:13 +0100 Subject: [PATCH 1/5] ipc4: mixin: Correct misleading comments Corrections to misleading comments in the HiFi3 implementation of mixin. Signed-off-by: Serhiy Katsyuba --- src/audio/mixin_mixout/mixin_mixout_hifi3.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/audio/mixin_mixout/mixin_mixout_hifi3.c b/src/audio/mixin_mixout/mixin_mixout_hifi3.c index da22be3634c5..d64b8944a2d8 100644 --- a/src/audio/mixin_mixout/mixin_mixout_hifi3.c +++ b/src/audio/mixin_mixout/mixin_mixout_hifi3.c @@ -47,7 +47,7 @@ static void mix_s16(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixe outu1 = AE_LA64_PP(out); m = n >> 2; left = n & 0x03; - /* process 4 frames per loop */ + /* process 4 samples per loop */ for (i = 0; i < m; i++) { AE_LA16X4_IP(in_sample, inu, in); AE_LA16X4_IP(out_sample, outu1, out); @@ -81,7 +81,7 @@ static void mix_s16(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixe inu = AE_LA64_PP(in); m = n >> 2; left = n & 0x03; - /* process 4 frames per loop */ + /* process 4 samples per loop */ for (i = 0; i < m; i++) { AE_LA16X4_IP(in_sample, inu, in); AE_SA16X4_IP(in_sample, outu2, out); @@ -138,7 +138,7 @@ static void mix_s16_gain(struct cir_buf_ptr *sink, int32_t start_sample, int32_t outu1 = AE_LA64_PP(out); m = n >> 2; left = n & 0x03; - /* process 4 frames per loop */ + /* process 4 samples per loop */ for (i = 0; i < m; i++) { AE_LA16X4_IP(in_sample, inu, in); @@ -184,7 +184,7 @@ static void mix_s16_gain(struct cir_buf_ptr *sink, int32_t start_sample, int32_t inu = AE_LA64_PP(in); m = n >> 2; left = n & 0x03; - /* process 4 frames per loop */ + /* process 4 samples per loop */ for (i = 0; i < m; i++) { AE_LA16X4_IP(in_sample, inu, in); From ffdfb3d434b1c55a09bc203fe4621012afd76b46 Mon Sep 17 00:00:00 2001 From: Serhiy Katsyuba Date: Thu, 23 Jan 2025 19:54:46 +0100 Subject: [PATCH 
2/5] ipc4: mixin: Fixes for HiFi5 impl of mix functions Fixes HiFi5 impl of mix_s24() and mix_s32(). Signed-off-by: Serhiy Katsyuba --- src/audio/mixin_mixout/mixin_mixout_hifi5.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/audio/mixin_mixout/mixin_mixout_hifi5.c b/src/audio/mixin_mixout/mixin_mixout_hifi5.c index 14bea3d98773..19a78d027cc0 100644 --- a/src/audio/mixin_mixout/mixin_mixout_hifi5.c +++ b/src/audio/mixin_mixout/mixin_mixout_hifi5.c @@ -82,7 +82,7 @@ static void mix_s16(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixe inu = AE_LA128_PP(in); m = n >> 3; left = n & 0x07; - /* process 8 frames per loop */ + /* process 8 samples per loop */ for (i = 0; i < m; i++) { AE_LA16X4X2_IP(in_sample, in_sample1, inu, in); AE_SA16X4X2_IP(in_sample, in_sample1, outu2, out); @@ -137,7 +137,7 @@ static void mix_s24(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixe outu1 = AE_LA128_PP(out); m = n >> 2; left = n & 3; - /* process 2 samples per time */ + /* process 4 samples per time */ for (i = 0; i < m; i++) { AE_LA32X2X2_IP(in_sample, in_sample1, inu, in); AE_LA32X2X2_IP(out_sample, out_sample1, outu1, out); @@ -148,8 +148,8 @@ static void mix_s24(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixe } AE_SA128POS_FP(outu2, out); - /* process the left sample to avoid memory access overrun */ - if (left) { + /* process the left samples to avoid memory access overrun */ + for (i = 0; i < left; i++) { AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32)); AE_L32_IP(out_sample, (ae_int32 *)out, 0); out_sample = AE_ADD24S(in_sample, out_sample); @@ -174,8 +174,8 @@ static void mix_s24(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixe AE_SA32X2X2_IP(in_sample, in_sample1, outu2, out); } AE_SA128POS_FP(outu2, out); - /* process the left sample to avoid memory access overrun */ - if (left) { + /* process the left samples to avoid memory access overrun */ + for (i = 0; i < left; i++) { 
AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32)); AE_S32_L_IP(in_sample, (ae_int32 *)out, sizeof(ae_int32)); } @@ -231,8 +231,8 @@ static void mix_s32(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixe } AE_SA128POS_FP(outu2, out); - /* process the left sample to avoid memory access overrun */ - if (left) { + /* process the left samples to avoid memory access overrun */ + for (i = 0; i < left; i++) { AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32)); AE_L32_IP(out_sample, (ae_int32 *)out, 0); out_sample = AE_ADD32S(in_sample, out_sample); @@ -258,8 +258,8 @@ static void mix_s32(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixe AE_SA32X2X2_IP(in_sample, in_sample1, outu2, out); } AE_SA128POS_FP(outu2, out); - /* process the left sample to avoid memory access overrun */ - if (left) { + /* process the left samples to avoid memory access overrun */ + for (i = 0; i < left; i++) { AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32)); AE_S32_L_IP(in_sample, (ae_int32 *)out, sizeof(ae_int32)); } From d0e54b2b5ff2ee1e79e31ed4d75aba4ebb6c753e Mon Sep 17 00:00:00 2001 From: Serhiy Katsyuba Date: Thu, 30 Jan 2025 16:00:49 +0100 Subject: [PATCH 3/5] ipc4: mixin: Don't clear HiFi5 input AE_VALIGN reg Clearing of AE_VALIGN registers with AE_ZALIGN128() is only necessary when they are used for memory write operations. There is no need to do this for registers used for memory read. 
Signed-off-by: Serhiy Katsyuba --- src/audio/mixin_mixout/mixin_mixout_hifi5.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/audio/mixin_mixout/mixin_mixout_hifi5.c b/src/audio/mixin_mixout/mixin_mixout_hifi5.c index 19a78d027cc0..19b73ef1051a 100644 --- a/src/audio/mixin_mixout/mixin_mixout_hifi5.c +++ b/src/audio/mixin_mixout/mixin_mixout_hifi5.c @@ -21,8 +21,8 @@ static void mix_s16(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixe ae_int16x4 out_sample, out_sample1; ae_int16x8 *in; ae_int16x8 *out; - ae_valignx2 inu = AE_ZALIGN128(); - ae_valignx2 outu1 = AE_ZALIGN128(); + ae_valignx2 inu; + ae_valignx2 outu1; ae_valignx2 outu2 = AE_ZALIGN128(); /* cir_buf_wrap() is required and is done below in a loop */ ae_int16 *dst = (ae_int16 *)sink->ptr + start_sample; @@ -111,8 +111,8 @@ static void mix_s24(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixe ae_int32x2 out_sample, out_sample1; ae_int32x4 *in; ae_int32x4 *out; - ae_valignx2 inu = AE_ZALIGN128(); - ae_valignx2 outu1 = AE_ZALIGN128(); + ae_valignx2 inu; + ae_valignx2 outu1; ae_valignx2 outu2 = AE_ZALIGN128(); /* cir_buf_wrap() is required and is done below in a loop */ int32_t *dst = (int32_t *)sink->ptr + start_sample; @@ -195,8 +195,8 @@ static void mix_s32(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixe ae_int32x2 out_sample, out_sample1; ae_int32x4 *in; ae_int32x4 *out; - ae_valignx2 inu = AE_ZALIGN128(); - ae_valignx2 outu1 = AE_ZALIGN128(); + ae_valignx2 inu; + ae_valignx2 outu1; ae_valignx2 outu2 = AE_ZALIGN128(); /* cir_buf_wrap() is required and is done below in a loop */ int32_t *dst = (int32_t *)sink->ptr + start_sample; From cad6f822e1e6ba9fef3cc3cbce4d60457ba10db7 Mon Sep 17 00:00:00 2001 From: Serhiy Katsyuba Date: Fri, 24 Jan 2025 11:38:38 +0100 Subject: [PATCH 4/5] ipc4: mixin: Fix HiFi5 impl of 24-bit mixing AE_ADD24S() expects input arguments to be Q9.23 values. 
Therefore, negative 24-bit values in a 32-bit container should have their sign extended to the upper 8 bits. Our other implementations of 24-bit mixing all perform sign extension prior to mixing and do not rely on samples being already sign-extended. Signed-off-by: Serhiy Katsyuba --- src/audio/mixin_mixout/mixin_mixout_hifi5.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/audio/mixin_mixout/mixin_mixout_hifi5.c b/src/audio/mixin_mixout/mixin_mixout_hifi5.c index 19b73ef1051a..cc49bf7a79de 100644 --- a/src/audio/mixin_mixout/mixin_mixout_hifi5.c +++ b/src/audio/mixin_mixout/mixin_mixout_hifi5.c @@ -142,7 +142,11 @@ static void mix_s24(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixe AE_LA32X2X2_IP(in_sample, in_sample1, inu, in); AE_LA32X2X2_IP(out_sample, out_sample1, outu1, out); out--; - out_sample = AE_ADD24S(in_sample, out_sample); + /* sign extend in samples as AE_ADD24S expects Q9.23 arguments */ + in_sample = AE_SLAI24S(AE_MOVF24X2_FROMINT32X2(in_sample), 0); + in_sample1 = AE_SLAI24S(AE_MOVF24X2_FROMINT32X2(in_sample1), 0); + /* out samples are already sign extended by other mixin in a loop below */ + out_sample = AE_ADD24S(in_sample, out_sample); out_sample1 = AE_ADD24S(in_sample1, out_sample1); AE_SA32X2X2_IP(out_sample, out_sample1, outu2, out); } @@ -152,6 +156,8 @@ static void mix_s24(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixe for (i = 0; i < left; i++) { AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32)); AE_L32_IP(out_sample, (ae_int32 *)out, 0); + /* sign extension */ + in_sample = AE_SLAI24S(AE_MOVF24X2_FROMINT32X2(in_sample), 0); out_sample = AE_ADD24S(in_sample, out_sample); AE_S32_L_IP(out_sample, (ae_int32 *)out, sizeof(ae_int32)); } @@ -171,12 +177,17 @@ static void mix_s24(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixe left = n & 3; for (i = 0; i < m; i++) { AE_LA32X2X2_IP(in_sample, in_sample1, inu, in); + /* sign extension */ + in_sample =
AE_SLAI24S(AE_MOVF24X2_FROMINT32X2(in_sample), 0); + in_sample1 = AE_SLAI24S(AE_MOVF24X2_FROMINT32X2(in_sample1), 0); AE_SA32X2X2_IP(in_sample, in_sample1, outu2, out); } AE_SA128POS_FP(outu2, out); /* process the left samples to avoid memory access overrun */ for (i = 0; i < left; i++) { AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32)); + /* sign extension */ + in_sample = AE_SLAI24S(AE_MOVF24X2_FROMINT32X2(in_sample), 0); AE_S32_L_IP(in_sample, (ae_int32 *)out, sizeof(ae_int32)); } } From fad27a264b10059b99c68c8ecc45e17ba9ebbce6 Mon Sep 17 00:00:00 2001 From: Serhiy Katsyuba Date: Fri, 24 Jan 2025 11:44:40 +0100 Subject: [PATCH 5/5] ipc4: mixin: Add "mix with gain" HiFi5 impl Adds HiFi5 implementation of "mix with gain" functions. Signed-off-by: Serhiy Katsyuba --- src/audio/mixin_mixout/mixin_mixout_hifi5.c | 293 +++++++++++++++++++- 1 file changed, 290 insertions(+), 3 deletions(-) diff --git a/src/audio/mixin_mixout/mixin_mixout_hifi5.c b/src/audio/mixin_mixout/mixin_mixout_hifi5.c index cc49bf7a79de..7e42e0f8d376 100644 --- a/src/audio/mixin_mixout/mixin_mixout_hifi5.c +++ b/src/audio/mixin_mixout/mixin_mixout_hifi5.c @@ -98,6 +98,107 @@ static void mix_s16(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixe } } } + +static void mix_s16_gain(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixed_samples, + const struct cir_buf_ptr *source, + int32_t sample_count, uint16_t gain) +{ + int samples_to_mix, samples_to_copy, left_samples; + int n, nmax, i, m, left; + ae_int16x4 in_sample, in_sample1; + ae_int16x4 out_sample, out_sample1; + ae_int16x8 *in; + ae_int16x8 *out; + ae_valignx2 inu; + ae_valignx2 outu1; + ae_valignx2 outu2 = AE_ZALIGN128(); + /* cir_buf_wrap() is required and is done below in a loop */ + ae_int16 *dst = (ae_int16 *)sink->ptr + start_sample; + ae_int16 *src = source->ptr; + ae_f16x4 gain_vec; + + /* this func does not support unity gain as 1 cannot be represented as Q1.15 value */ + assert(gain < 
IPC4_MIXIN_UNITY_GAIN); + + gain_vec = AE_L16_I((ae_int16 *)&gain, 0); + gain_vec = AE_SLAI16S(gain_vec, 5); /* convert to Q1.15 */ + + assert(mixed_samples >= start_sample); + samples_to_mix = AE_MIN32(mixed_samples - start_sample, sample_count); + samples_to_copy = sample_count - samples_to_mix; + n = 0; + + for (left_samples = samples_to_mix; left_samples > 0; left_samples -= n) { + src = cir_buf_wrap(src + n, source->buf_start, source->buf_end); + dst = cir_buf_wrap(dst + n, sink->buf_start, sink->buf_end); + /* calculate the remaining samples*/ + nmax = (ae_int16 *)source->buf_end - src; + n = AE_MIN32(left_samples, nmax); + nmax = (ae_int16 *)sink->buf_end - dst; + n = AE_MIN32(n, nmax); + in = (ae_int16x8 *)src; + out = (ae_int16x8 *)dst; + inu = AE_LA128_PP(in); + outu1 = AE_LA128_PP(out); + m = n >> 3; + left = n & 0x07; + /* process 8 samples per loop */ + for (i = 0; i < m; i++) { + AE_LA16X4X2_IP(in_sample, in_sample1, inu, in); + AE_LA16X4X2_IP(out_sample, out_sample1, outu1, out); + out--; + in_sample = AE_MULFP16X4RS(in_sample, gain_vec); + in_sample1 = AE_MULFP16X4RS(in_sample1, gain_vec); + out_sample = AE_ADD16S(in_sample, out_sample); + out_sample1 = AE_ADD16S(in_sample1, out_sample1); + AE_SA16X4X2_IP(out_sample, out_sample1, outu2, out); + } + AE_SA128POS_FP(outu2, out); + + /* process the remaining samples (fewer than 8) + * one by one to avoid memory access overrun + */ + for (i = 0; i < left ; i++) { + AE_L16_IP(in_sample, (ae_int16 *)in, sizeof(ae_int16)); + AE_L16_IP(out_sample, (ae_int16 *)out, 0); + in_sample = AE_MULFP16X4RS(in_sample, gain_vec); + out_sample = AE_ADD16S(in_sample, out_sample); + AE_S16_0_IP(out_sample, (ae_int16 *)out, sizeof(ae_int16)); + } + } + + for (left_samples = samples_to_copy; left_samples > 0; left_samples -= n) { + src = cir_buf_wrap(src + n, source->buf_start, source->buf_end); + dst = cir_buf_wrap(dst + n, sink->buf_start, sink->buf_end); + /* calculate the remaining samples*/ + nmax = (ae_int16
*)source->buf_end - src; + n = AE_MIN32(left_samples, nmax); + nmax = (ae_int16 *)sink->buf_end - dst; + n = AE_MIN32(n, nmax); + in = (ae_int16x8 *)src; + out = (ae_int16x8 *)dst; + inu = AE_LA128_PP(in); + m = n >> 3; + left = n & 0x07; + /* process 8 samples per loop */ + for (i = 0; i < m; i++) { + AE_LA16X4X2_IP(in_sample, in_sample1, inu, in); + in_sample = AE_MULFP16X4RS(in_sample, gain_vec); + in_sample1 = AE_MULFP16X4RS(in_sample1, gain_vec); + AE_SA16X4X2_IP(in_sample, in_sample1, outu2, out); + } + AE_SA128POS_FP(outu2, out); + + /* process the remaining samples (fewer than 8) + * one by one to avoid memory access overrun + */ + for (i = 0; i < left ; i++) { + AE_L16_IP(in_sample, (ae_int16 *)in, sizeof(ae_int16)); + in_sample = AE_MULFP16X4RS(in_sample, gain_vec); + AE_S16_0_IP(in_sample, (ae_int16 *)out, sizeof(ae_int16)); + } + } +} #endif /* CONFIG_FORMAT_S16LE */ #if CONFIG_FORMAT_S24LE @@ -193,6 +294,102 @@ static void mix_s24(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixe } } +static void mix_s24_gain(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixed_samples, + const struct cir_buf_ptr *source, + int32_t sample_count, uint16_t gain) +{ + int samples_to_mix, samples_to_copy, left_samples; + int n, nmax, i, m, left; + ae_int32x2 in_sample, in_sample1; + ae_int32x2 out_sample, out_sample1; + ae_int32x4 *in; + ae_int32x4 *out; + ae_valignx2 inu; + ae_valignx2 outu1; + ae_valignx2 outu2 = AE_ZALIGN128(); + /* cir_buf_wrap() is required and is done below in a loop */ + int32_t *dst = (int32_t *)sink->ptr + start_sample; + int32_t *src = source->ptr; + ae_f24x2 gain_vec; + ae_int32 gain32 = (ae_int32)gain; + + /* this func does not support unity gain as 1 cannot be represented as Q1.23 value */ + assert(gain < IPC4_MIXIN_UNITY_GAIN); + + gain_vec = AE_MOVF24X2_FROMINT32X2(AE_L32_I(&gain32, 0)); + gain_vec = AE_SLAI24S(gain_vec, 13); /* convert to Q1.23 */ + + assert(mixed_samples >= start_sample); + samples_to_mix =
AE_MIN32(mixed_samples - start_sample, sample_count); + samples_to_copy = sample_count - samples_to_mix; + n = 0; + + for (left_samples = samples_to_mix; left_samples > 0; left_samples -= n) { + src = cir_buf_wrap(src + n, source->buf_start, source->buf_end); + dst = cir_buf_wrap(dst + n, sink->buf_start, sink->buf_end); + /* calculate the remaining samples*/ + nmax = (int32_t *)source->buf_end - src; + n = AE_MIN32(left_samples, nmax); + nmax = (int32_t *)sink->buf_end - dst; + n = AE_MIN32(n, nmax); + in = (ae_int32x4 *)src; + out = (ae_int32x4 *)dst; + inu = AE_LA128_PP(in); + outu1 = AE_LA128_PP(out); + m = n >> 2; + left = n & 3; + /* process 4 samples per time */ + for (i = 0; i < m; i++) { + AE_LA32X2X2_IP(in_sample, in_sample1, inu, in); + AE_LA32X2X2_IP(out_sample, out_sample1, outu1, out); + out--; + in_sample = AE_MULFP24X2R(AE_MOVF24X2_FROMINT32X2(in_sample), gain_vec); + in_sample1 = AE_MULFP24X2R(AE_MOVF24X2_FROMINT32X2(in_sample1), gain_vec); + /* out samples are already sign extended by other mixin in a loop below */ + out_sample = AE_ADD24S(in_sample, out_sample); + out_sample1 = AE_ADD24S(in_sample1, out_sample1); + AE_SA32X2X2_IP(out_sample, out_sample1, outu2, out); + } + AE_SA128POS_FP(outu2, out); + + /* process the left samples to avoid memory access overrun */ + for (i = 0; i < left; i++) { + AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32)); + AE_L32_IP(out_sample, (ae_int32 *)out, 0); + in_sample = AE_MULFP24X2R(AE_MOVF24X2_FROMINT32X2(in_sample), gain_vec); + /* out samples are already sign extended by other mixin in a loop below */ + out_sample = AE_ADD24S(in_sample, out_sample); + AE_S32_L_IP(out_sample, (ae_int32 *)out, sizeof(ae_int32)); + } + } + + for (left_samples = samples_to_copy; left_samples > 0; left_samples -= n) { + src = cir_buf_wrap(src + n, source->buf_start, source->buf_end); + dst = cir_buf_wrap(dst + n, sink->buf_start, sink->buf_end); + nmax = (int32_t *)source->buf_end - src; + n = AE_MIN32(left_samples, nmax); 
+ nmax = (int32_t *)sink->buf_end - dst; + n = AE_MIN32(n, nmax); + in = (ae_int32x4 *)src; + out = (ae_int32x4 *)dst; + inu = AE_LA128_PP(in); + m = n >> 2; + left = n & 3; + for (i = 0; i < m; i++) { + AE_LA32X2X2_IP(in_sample, in_sample1, inu, in); + in_sample = AE_MULFP24X2R(AE_MOVF24X2_FROMINT32X2(in_sample), gain_vec); + in_sample1 = AE_MULFP24X2R(AE_MOVF24X2_FROMINT32X2(in_sample1), gain_vec); + AE_SA32X2X2_IP(in_sample, in_sample1, outu2, out); + } + AE_SA128POS_FP(outu2, out); + /* process the left samples to avoid memory access overrun */ + for (i = 0; i < left; i++) { + AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32)); + in_sample = AE_MULFP24X2R(AE_MOVF24X2_FROMINT32X2(in_sample), gain_vec); + AE_S32_L_IP(in_sample, (ae_int32 *)out, sizeof(ae_int32)); + } + } +} #endif /* CONFIG_FORMAT_S24LE */ #if CONFIG_FORMAT_S32LE @@ -277,18 +474,108 @@ static void mix_s32(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixe } } +static void mix_s32_gain(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixed_samples, + const struct cir_buf_ptr *source, + int32_t sample_count, uint16_t gain) +{ + int samples_to_mix, samples_to_copy, left_samples; + int n, nmax, i, m, left; + ae_int32x2 in_sample, in_sample1; + ae_int32x2 out_sample, out_sample1; + ae_int32x4 *in; + ae_int32x4 *out; + ae_valignx2 inu; + ae_valignx2 outu1; + ae_valignx2 outu2 = AE_ZALIGN128(); + /* cir_buf_wrap() is required and is done below in a loop */ + int32_t *dst = (int32_t *)sink->ptr + start_sample; + int32_t *src = source->ptr; + ae_f16x4 gain_vec; + + /* this func does not support unity gain as 1 cannot be represented as Q1.15 value */ + assert(gain < IPC4_MIXIN_UNITY_GAIN); + + gain_vec = AE_L16_I((ae_int16 *)&gain, 0); + gain_vec = AE_SLAI16S(gain_vec, 5); /* convert to Q1.15 */ + + assert(mixed_samples >= start_sample); + samples_to_mix = AE_MIN32(mixed_samples - start_sample, sample_count); + samples_to_copy = sample_count - samples_to_mix; + n = 0; + + for 
(left_samples = samples_to_mix; left_samples > 0; left_samples -= n) { + src = cir_buf_wrap(src + n, source->buf_start, source->buf_end); + dst = cir_buf_wrap(dst + n, sink->buf_start, sink->buf_end); + /* calculate the remaining samples*/ + nmax = (int32_t *)source->buf_end - src; + n = AE_MIN32(left_samples, nmax); + nmax = (int32_t *)sink->buf_end - dst; + n = AE_MIN32(n, nmax); + in = (ae_int32x4 *)src; + out = (ae_int32x4 *)dst; + inu = AE_LA128_PP(in); + outu1 = AE_LA128_PP(out); + m = n >> 2; + left = n & 3; + for (i = 0; i < m; i++) { + AE_LA32X2X2_IP(in_sample, in_sample1, inu, in); + AE_LA32X2X2_IP(out_sample, out_sample1, outu1, out); + out--; + AE_MULAFP32X16X2RS_L(out_sample, in_sample, gain_vec); + AE_MULAFP32X16X2RS_L(out_sample1, in_sample1, gain_vec); + AE_SA32X2X2_IP(out_sample, out_sample1, outu2, out); + } + AE_SA128POS_FP(outu2, out); + + /* process the left samples to avoid memory access overrun */ + for (i = 0; i < left; i++) { + AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32)); + AE_L32_IP(out_sample, (ae_int32 *)out, 0); + AE_MULAFP32X16X2RS_L(out_sample, in_sample, gain_vec); + AE_S32_L_IP(out_sample, (ae_int32 *)out, sizeof(ae_int32)); + } + } + + for (left_samples = samples_to_copy; left_samples > 0; left_samples -= n) { + src = cir_buf_wrap(src + n, source->buf_start, source->buf_end); + dst = cir_buf_wrap(dst + n, sink->buf_start, sink->buf_end); + /* calculate the remaining samples*/ + nmax = (int32_t *)source->buf_end - src; + n = AE_MIN32(left_samples, nmax); + nmax = (int32_t *)sink->buf_end - dst; + n = AE_MIN32(n, nmax); + in = (ae_int32x4 *)src; + out = (ae_int32x4 *)dst; + inu = AE_LA128_PP(in); + m = n >> 2; + left = n & 3; + for (i = 0; i < m; i++) { + AE_LA32X2X2_IP(in_sample, in_sample1, inu, in); + in_sample = AE_MULFP32X16X2RS_L(in_sample, gain_vec); + in_sample1 = AE_MULFP32X16X2RS_L(in_sample1, gain_vec); + AE_SA32X2X2_IP(in_sample, in_sample1, outu2, out); + } + AE_SA128POS_FP(outu2, out); + /* process the left 
samples to avoid memory access overrun */ + for (i = 0; i < left; i++) { + AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32)); + in_sample = AE_MULFP32X16X2RS_L(in_sample, gain_vec); + AE_S32_L_IP(in_sample, (ae_int32 *)out, sizeof(ae_int32)); + } + } +} #endif /* CONFIG_FORMAT_S32LE */ /* TODO: implement mixing functions with gain support!*/ __cold_rodata const struct mix_func_map mix_func_map[] = { #if CONFIG_FORMAT_S16LE - { SOF_IPC_FRAME_S16_LE, mix_s16, mix_s16 }, + { SOF_IPC_FRAME_S16_LE, mix_s16, mix_s16_gain }, #endif #if CONFIG_FORMAT_S24LE - { SOF_IPC_FRAME_S24_4LE, mix_s24, mix_s24 }, + { SOF_IPC_FRAME_S24_4LE, mix_s24, mix_s24_gain }, #endif #if CONFIG_FORMAT_S32LE - { SOF_IPC_FRAME_S32_LE, mix_s32, mix_s32 } + { SOF_IPC_FRAME_S32_LE, mix_s32, mix_s32_gain } #endif };