Skip to content

Commit

Permalink
NEON
Browse files Browse the repository at this point in the history
  • Loading branch information
louist103 committed Nov 24, 2024
1 parent 5e98ea5 commit b6a3bf8
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 7 deletions.
81 changes: 75 additions & 6 deletions mm/2s2h/mixer.c
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
//! This file is always optimized by a rule in the CMakeList. This is done because the SIMD functions are very large when unoptimized and clang does not allow optimizing a single function.
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#include "mixer.h"
#include "attributes.h"
#ifndef __clang__
#pragma GCC optimize("unroll-loops")
#endif
Expand Down Expand Up @@ -69,6 +69,7 @@ static int16_t resample_table[64][4] = {
};

static void aMixImplSSE2(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr);
static void aMixImplNEON(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr);

static inline int16_t clamp16(int32_t v) {
if (v < -0x8000) {
Expand Down Expand Up @@ -300,7 +301,7 @@ void aEnvMixerImpl(uint16_t in_addr, uint16_t n_samples, bool swap_reverb, bool
} while (n > 0);
}

static void FORCE_OPTIMIZE aMixImplRef(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr) {
static void aMixImplRef(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr) {
int nbytes = ROUND_UP_32(ROUND_DOWN_16(count << 4));
int16_t* in = BUF_S16(in_addr);
int16_t* out = BUF_S16(out_addr);
Expand Down Expand Up @@ -330,6 +331,8 @@ static void FORCE_OPTIMIZE aMixImplRef(uint16_t count, int16_t gain, uint16_t in
void aMixImpl(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr) {
#if __SSE2__
aMixImplSSE2(count, gain, in_addr, out_addr);
#elif defined (__ARM_NEON)
aMixImplNEON(count, gain, in_addr, out_addr);
#else
aMixImplRef(count, gain, in_addr, out_addr);
#endif
Expand Down Expand Up @@ -572,9 +575,13 @@ void aUnkCmd19Impl(uint8_t f, uint16_t count, uint16_t out_addr, uint16_t in_add
// A note about FORCE_OPTIMIZE...
// Compilers don't handle SIMD code well when not optimizing. It is unlikely that this code will need to be debugged
// outside of specific audio issues. We can assume it should always be optimized.

// SIMD operations expect aligned data
#include "align_asset_macro.h"

#if defined(__SSE2__)
#include <immintrin.h>
#include "align_asset_macro.h"


static const ALIGN_ASSET(16) int16_t x7fff[8] = {
0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF,
Expand All @@ -586,7 +593,7 @@ static const ALIGN_ASSET(16) int32_t x4000[4] = {
0x4000,
};

FORCE_OPTIMIZE static void aMixImplSSE2(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr) {
static void aMixImplSSE2(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr) {
int nbytes = ROUND_UP_32(ROUND_DOWN_16(count << 4));
int16_t* in = BUF_S16(in_addr);
int16_t* out = BUF_S16(out_addr);
Expand Down Expand Up @@ -653,13 +660,75 @@ FORCE_OPTIMIZE static void aMixImplSSE2(uint16_t count, int16_t gain, uint16_t i
}
}
#endif
#if defined (__ARM_NEON)
#include <arm_neon.h>
static const int32_t x4000Arr[4] = {0x4000, 0x4000, 0x4000, 0x4000};
void aMixImplNEON(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr) {
int nbytes = ROUND_UP_32(ROUND_DOWN_16(count << 4));
int16_t* in = BUF_S16(in_addr);
int16_t* out = BUF_S16(out_addr);
int i;
int32_t sample;


if (gain == -0x8000) {
while (nbytes > 0) {
for (unsigned int i = 0; i < 2; i++) {
int16x8_t outVec = vld1q_s16(out);
int16x8_t inVec = vld1q_s16(in);
int16x8_t subVec = vqsubq_s16(outVec, inVec);
vst1q_s16(out, subVec);
nbytes -= 8 * sizeof(int16_t);
out += 8;
in += 8;
}
}
}
int16x8_t gainVec = vdupq_n_s16(gain);
int32x4_t x4000Vec = vld1q_s32(x4000Arr);
while (nbytes > 0) {
for (unsigned int i = 0; i < 2; i++) {
//for (i = 0; i < 16; i++) {
int16x8_t outVec = vld1q_s16(out);
int16x8_t inVec = vld1q_s16(in);
int16x4_t outLoVec = vget_low_s16(outVec);
int16x8_t outLoVec2 = vcombine_s16(outLoVec, outLoVec);
int16x4_t inLoVec = vget_low_s16(inVec);
int16x8_t inLoVec2 = vcombine_s16(inLoVec, inLoVec);
int32x4_t outX7fffHiVec = vmull_high_n_s16(outVec, 0x7FFF);
int32x4_t outX7fffLoVec = vmull_high_n_s16(outLoVec2, 0x7FFF);

int32x4_t inGainLoVec = vmull_high_s16(inLoVec2, gainVec);
int32x4_t inGainHiVec = vmull_high_s16(inVec, gainVec);
int32x4_t addVecLo = vaddq_s32(outX7fffLoVec, inGainLoVec);
int32x4_t addVecHi = vaddq_s32(outX7fffHiVec, inGainHiVec);
addVecHi = vaddq_s32(addVecHi, x4000Vec);
addVecLo = vaddq_s32(addVecLo, x4000Vec);
int32x4_t shiftVecHi = vshrq_n_s32(addVecHi, 15);
int32x4_t shiftVecLo = vshrq_n_s32(addVecLo, 15);
int16x4_t shiftedNarrowHiVec = vqmovn_s32(shiftVecHi);
int16x4_t shiftedNarrowLoVec = vqmovn_s32(shiftVecLo);
vst1_s16(out, shiftedNarrowLoVec);
out += 4;
vst1_s16(out, shiftedNarrowHiVec);
//int16x8_t finalVec = vcombine_s16(shiftedNarrowLoVec, shiftedNarrowHiVec);
//vst1q_s16(out, finalVec);
out += 4;
in +=8;

nbytes -= 8 * sizeof(int16_t);
}
}
}
#endif

#if 0
static const ALIGN_ASSET(32) int16_t x7fff[16] = { 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF,};
static const ALIGN_ASSET(32) int32_t x4000[8] = { 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000};

#pragma GCC target("avx2")
// AVX2 version of the SSE2 implementation above. AVX2 wasn't released until 2014 and I don't have a good way of checking for it at compile time.
FORCE_OPTIMIZE void aMixImpl256(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr) {
void aMixImpl256(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr) {
int nbytes = ROUND_UP_32(ROUND_DOWN_16(count << 4));
int16_t* in = BUF_S16(in_addr);
int16_t* out = BUF_S16(out_addr);
Expand Down Expand Up @@ -721,4 +790,4 @@ FORCE_OPTIMIZE void aMixImpl256(uint16_t count, int16_t gain, uint16_t in_addr,
nbytes -= 16 * sizeof(int16_t);
}
}
#endif
#endif
8 changes: 7 additions & 1 deletion mm/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.16.0 FATAL_ERROR)
cmake_minimum_required(VERSION 3.16.0 FATAL_ERROR)

set(CMAKE_SYSTEM_VERSION 10.0 CACHE STRING "" FORCE)

Expand Down Expand Up @@ -530,6 +530,12 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang|AppleClang")
)
endif()
endif()

# This file contains a few SIMD functions. Those functions are much more inefficient without optimizations. MSVC and GCC allow for optimizing a single function, but clang does not.
set_source_files_properties(2s2h/mixer.c PROPERTIES
COMPILE_FLAGS "-O2"
)

################################################################################
# Pre build events
################################################################################
Expand Down

0 comments on commit b6a3bf8

Please sign in to comment.