From 546f9dddb471333999b48ea115905ba63e454e2a Mon Sep 17 00:00:00 2001 From: Fabian 'ryg' Giesen Date: Mon, 4 Nov 2024 15:07:02 -0800 Subject: [PATCH] Add gatherf_byte_inds for gathers using byte indices from memory (#511) Adds a new function to wrap gathers using byte indices from memory, avoiding the byte-to-int conversion for ISAs that don't have native gathers. Adds a new build option ASTCENC_X86_GATHERS (default ON) to allow builds to disable use of native gathers on X86 as they are much slower than scalar fallbacks on some microarchitectures (AMD Zen, pre-Skylake Intel). Co-authored-by: Fabian Giesen Co-authored-by: Pete Harris --- CMakeLists.txt | 2 + Docs/Building.md | 11 ++++++ Source/astcenc_averages_and_directions.cpp | 18 ++++----- .../astcenc_ideal_endpoints_and_weights.cpp | 38 +++++++++---------- Source/astcenc_mathlib.h | 2 + Source/astcenc_pick_best_endpoint_format.cpp | 10 ++--- Source/astcenc_vecmathlib.h | 2 + Source/astcenc_vecmathlib_avx2_8.h | 27 +++++++++++++ Source/astcenc_vecmathlib_neon_4.h | 19 ++++++++++ Source/astcenc_vecmathlib_none_4.h | 12 ++++++ Source/astcenc_vecmathlib_sse_4.h | 19 +++++++++- Source/astcenc_vecmathlib_sve_8.h | 10 +++++ Source/cmake_core.cmake | 10 ++++- 13 files changed, 144 insertions(+), 36 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 001dede5d..38e334259 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -51,6 +51,7 @@ option(ASTCENC_UBSAN "Enable astcenc builds with undefined behavior sanitizer") option(ASTCENC_UNITTEST "Enable astcenc builds with unit tests") option(ASTCENC_INVARIANCE "Enable astcenc floating point invariance" ON) option(ASTCENC_CLI "Enable build of astcenc command line tools" ON) +option(ASTCENC_X86_GATHERS "Enable use of native x86 gathers" ON) # Preflight for some macOS-specific build options if("${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin") @@ -127,6 +128,7 @@ message(STATUS "x86-64 backend options") printopt("AVX2 backend " ${ASTCENC_ISA_AVX2}) printopt("SSE4.1 backend " ${ASTCENC_ISA_SSE41}) printopt("SSE2 backend " ${ASTCENC_ISA_SSE2}) +printopt("Use native gathers " ${ASTCENC_X86_GATHERS}) message(STATUS "Agnostic backend options") printopt("NONE backend " ${ASTCENC_ISA_NONE}) printopt("NATIVE backend " ${ASTCENC_ISA_NATIVE}) diff --git a/Docs/Building.md b/Docs/Building.md index f08170555..475226e4c 100644 --- a/Docs/Building.md +++ b/Docs/Building.md @@ -203,6 +203,17 @@ To enable this binary variant add `-DASTCENC_ISA_NONE=ON` to the CMake command line when configuring. It is NOT recommended to use this for production; it is significantly slower than the vectorized SIMD builds. +### No x86 gather instruction builds + +On many x86 microarchitectures the native AVX gather instructions are slower +than simply performing manual scalar loads and combining the results. Gathers +are enabled by default, but can be disabled by setting the CMake option +`-DASTCENC_X86_GATHERS=OFF` on the command line when configuring. + +Note that we have seen mixed results when compiling the scalar fallback path, +so we would recommend testing which option works best for the compiler and +microarchitecture pairing that you are targeting. + ### Test builds We support building unit tests. These use the `googletest` framework, which is diff --git a/Source/astcenc_averages_and_directions.cpp b/Source/astcenc_averages_and_directions.cpp index dcff0d224..8e2f8d8c4 100644 --- a/Source/astcenc_averages_and_directions.cpp +++ b/Source/astcenc_averages_and_directions.cpp @@ -778,12 +778,12 @@ void compute_error_squared_rgba( for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) { vmask mask = lane_ids < vint(texel_count); - vint texel_idxs(texel_indexes + i); + const uint8_t* texel_idxs = texel_indexes + i; - vfloat data_r = gatherf(blk.data_r, texel_idxs); - vfloat data_g = gatherf(blk.data_g, texel_idxs); - vfloat data_b = gatherf(blk.data_b, texel_idxs); - vfloat data_a = gatherf(blk.data_a, texel_idxs); + vfloat data_r = gatherf_byte_inds(blk.data_r, texel_idxs); + vfloat data_g = gatherf_byte_inds(blk.data_g, texel_idxs); + vfloat data_b = gatherf_byte_inds(blk.data_b, texel_idxs); + vfloat data_a = gatherf_byte_inds(blk.data_a, texel_idxs); vfloat uncor_param = (data_r * l_uncor_bs0) + (data_g * l_uncor_bs1) @@ -892,11 +892,11 @@ void compute_error_squared_rgb( for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) { vmask mask = lane_ids < vint(texel_count); - vint texel_idxs(texel_indexes + i); + const uint8_t* texel_idxs = texel_indexes + i; - vfloat data_r = gatherf(blk.data_r, texel_idxs); - vfloat data_g = gatherf(blk.data_g, texel_idxs); - vfloat data_b = gatherf(blk.data_b, texel_idxs); + vfloat data_r = gatherf_byte_inds(blk.data_r, texel_idxs); + vfloat data_g = gatherf_byte_inds(blk.data_g, texel_idxs); + vfloat data_b = gatherf_byte_inds(blk.data_b, texel_idxs); vfloat uncor_param = (data_r * l_uncor_bs0) + (data_g * l_uncor_bs1) diff --git a/Source/astcenc_ideal_endpoints_and_weights.cpp b/Source/astcenc_ideal_endpoints_and_weights.cpp index ec680dd5e..804ee42fe 100644 --- a/Source/astcenc_ideal_endpoints_and_weights.cpp +++ b/Source/astcenc_ideal_endpoints_and_weights.cpp @@ -41,16 +41,16 @@ static vfloat bilinear_infill_vla( unsigned int index ) { // Load the bilinear filter texel weight indexes in the decimated grid - vint weight_idx0 = vint(di.texel_weights_tr[0] + index); - vint weight_idx1 = vint(di.texel_weights_tr[1] + index); - vint weight_idx2 = vint(di.texel_weights_tr[2] + index); - vint weight_idx3 = vint(di.texel_weights_tr[3] + index); + const uint8_t* weight_idx0 = di.texel_weights_tr[0] + index; + const uint8_t* weight_idx1 = di.texel_weights_tr[1] + index; + const uint8_t* weight_idx2 = di.texel_weights_tr[2] + index; + const uint8_t* weight_idx3 = di.texel_weights_tr[3] + index; // Load the bilinear filter weights from the decimated grid - vfloat weight_val0 = gatherf(weights, weight_idx0); - vfloat weight_val1 = gatherf(weights, weight_idx1); - vfloat weight_val2 = gatherf(weights, weight_idx2); - vfloat weight_val3 = gatherf(weights, weight_idx3); + vfloat weight_val0 = gatherf_byte_inds(weights, weight_idx0); + vfloat weight_val1 = gatherf_byte_inds(weights, weight_idx1); + vfloat weight_val2 = gatherf_byte_inds(weights, weight_idx2); + vfloat weight_val3 = gatherf_byte_inds(weights, weight_idx3); // Load the weight contribution factors for each decimated weight vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index); @@ -81,12 +81,12 @@ static vfloat bilinear_infill_vla_2( unsigned int index ) { // Load the bilinear filter texel weight indexes in the decimated grid - vint weight_idx0 = vint(di.texel_weights_tr[0] + index); - vint weight_idx1 = vint(di.texel_weights_tr[1] + index); + const uint8_t* weight_idx0 = di.texel_weights_tr[0] + index; + const uint8_t* weight_idx1 = di.texel_weights_tr[1] + index; // Load the bilinear filter weights from the decimated grid - vfloat weight_val0 = gatherf(weights, weight_idx0); - vfloat weight_val1 = gatherf(weights, weight_idx1); + vfloat weight_val0 = gatherf_byte_inds(weights, weight_idx0); + vfloat weight_val1 = gatherf_byte_inds(weights, weight_idx1); // Load the weight contribution factors for each decimated weight vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index); @@ -894,18 +894,18 @@ void compute_ideal_weights_for_decimation( for (unsigned int j = 0; j < max_texel_count; j++) { - vint texel(di.weight_texels_tr[j] + i); + const uint8_t* texel = di.weight_texels_tr[j] + i; vfloat weight = loada(di.weights_texel_contribs_tr[j] + i); if (!constant_wes) { - weight_error_scale = gatherf(ei.weight_error_scale, texel); + weight_error_scale = gatherf_byte_inds(ei.weight_error_scale, texel); } vfloat contrib_weight = weight * weight_error_scale; weight_weight += contrib_weight; - initial_weight += gatherf(ei.weights, texel) * contrib_weight; + initial_weight += gatherf_byte_inds(ei.weights, texel) * contrib_weight; } storea(initial_weight / weight_weight, dec_weight_ideal_value + i); @@ -952,17 +952,17 @@ void compute_ideal_weights_for_decimation( for (unsigned int j = 0; j < max_texel_count; j++) { - vint texel(di.weight_texels_tr[j] + i); + const uint8_t* texel = di.weight_texels_tr[j] + i; vfloat contrib_weight = loada(di.weights_texel_contribs_tr[j] + i); if (!constant_wes) { - weight_error_scale = gatherf(ei.weight_error_scale, texel); + weight_error_scale = gatherf_byte_inds(ei.weight_error_scale, texel); } vfloat scale = weight_error_scale * contrib_weight; - vfloat old_weight = gatherf(infilled_weights, texel); - vfloat ideal_weight = gatherf(ei.weights, texel); + vfloat old_weight = gatherf_byte_inds(infilled_weights, texel); + vfloat ideal_weight = gatherf_byte_inds(ei.weights, texel); error_change0 += contrib_weight * scale; error_change1 += (old_weight - ideal_weight) * scale; diff --git a/Source/astcenc_mathlib.h b/Source/astcenc_mathlib.h index f69015394..1d73bf1d2 100644 --- a/Source/astcenc_mathlib.h +++ b/Source/astcenc_mathlib.h @@ -58,8 +58,10 @@ #ifndef ASTCENC_AVX #if defined(__AVX2__) #define ASTCENC_AVX 2 + #define ASTCENC_X86_GATHERS 1 #elif defined(__AVX__) #define ASTCENC_AVX 1 + #define ASTCENC_X86_GATHERS 1 #else #define ASTCENC_AVX 0 #endif diff --git a/Source/astcenc_pick_best_endpoint_format.cpp b/Source/astcenc_pick_best_endpoint_format.cpp index 6e41005bc..bf872a924 100644 --- a/Source/astcenc_pick_best_endpoint_format.cpp +++ b/Source/astcenc_pick_best_endpoint_format.cpp @@ -123,21 +123,21 @@ static void compute_error_squared_rgb_single_partition( vint lane_ids = vint::lane_id(); for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) { - vint tix(texel_indexes + i); + const uint8_t* tix = texel_indexes + i; vmask mask = lane_ids < vint(texel_count); lane_ids += vint(ASTCENC_SIMD_WIDTH); // Compute the error that arises from just ditching alpha - vfloat data_a = gatherf(blk.data_a, tix); + vfloat data_a = gatherf_byte_inds(blk.data_a, tix); vfloat alpha_diff = data_a - default_a; alpha_diff = alpha_diff * alpha_diff; haccumulate(a_drop_errv, alpha_diff, mask); - vfloat data_r = gatherf(blk.data_r, tix); - vfloat data_g = gatherf(blk.data_g, tix); - vfloat data_b = gatherf(blk.data_b, tix); + vfloat data_r = gatherf_byte_inds(blk.data_r, tix); + vfloat data_g = gatherf_byte_inds(blk.data_g, tix); + vfloat data_b = gatherf_byte_inds(blk.data_b, tix); // Compute uncorrelated error vfloat param = data_r * uncor_bs0 diff --git a/Source/astcenc_vecmathlib.h b/Source/astcenc_vecmathlib.h index b41f6fa3a..e6ae97cc4 100644 --- a/Source/astcenc_vecmathlib.h +++ b/Source/astcenc_vecmathlib.h @@ -77,6 +77,8 @@ #define ASTCENC_NO_INLINE __attribute__ ((noinline)) #endif +template T gatherf_byte_inds(const float* base, const uint8_t* indices); + #if ASTCENC_AVX >= 2 // If we have AVX2 expose 8-wide VLA. #include "astcenc_vecmathlib_sse_4.h" diff --git a/Source/astcenc_vecmathlib_avx2_8.h b/Source/astcenc_vecmathlib_avx2_8.h index 9b84ef7a1..b400b313b 100644 --- a/Source/astcenc_vecmathlib_avx2_8.h +++ b/Source/astcenc_vecmathlib_avx2_8.h @@ -903,6 +903,33 @@ ASTCENC_SIMD_INLINE vfloat8 gatherf(const float* base, vint8 indices) return vfloat8(_mm256_i32gather_ps(base, indices.m, 4)); } +/** + * @brief Load a vector of gathered results from an array using byte indices from memory + */ +template<> +ASTCENC_SIMD_INLINE vfloat8 gatherf_byte_inds(const float* base, const uint8_t* indices) +{ +#if ASTCENC_X86_GATHERS == 0 + // Perform manual gather using scalar loads in two separate dependency chains, + // then merge late. MSVC translates this 1:1, which is OK. Clang turns it + // into a bunch of memory-operand inserts on 128-bit halves then merges late, + // which performs significantly worse in tests. + __m256 m0 = _mm256_broadcast_ss(base + indices[0]); + __m256 m1 = _mm256_broadcast_ss(base + indices[1]); + m0 = _mm256_blend_ps(m0, _mm256_broadcast_ss(base + indices[2]), 1 << 2); + m1 = _mm256_blend_ps(m1, _mm256_broadcast_ss(base + indices[3]), 1 << 3); + m0 = _mm256_blend_ps(m0, _mm256_broadcast_ss(base + indices[4]), 1 << 4); + m1 = _mm256_blend_ps(m1, _mm256_broadcast_ss(base + indices[5]), 1 << 5); + m0 = _mm256_blend_ps(m0, _mm256_broadcast_ss(base + indices[6]), 1 << 6); + m1 = _mm256_blend_ps(m1, _mm256_broadcast_ss(base + indices[7]), 1 << 7); + + return vfloat8(_mm256_blend_ps(m0, m1, 0xaa)); +#else + vint8 inds(indices); + return gatherf(base, inds); +#endif +} + /** * @brief Store a vector to an unaligned memory address. */ diff --git a/Source/astcenc_vecmathlib_neon_4.h b/Source/astcenc_vecmathlib_neon_4.h index c7ff01289..9142a3286 100644 --- a/Source/astcenc_vecmathlib_neon_4.h +++ b/Source/astcenc_vecmathlib_neon_4.h @@ -828,6 +828,25 @@ ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices) #endif } +/** + * @brief Load a vector of gathered results from an array using byte indices from memory + */ +template<> +ASTCENC_SIMD_INLINE vfloat4 gatherf_byte_inds(const float* base, const uint8_t* indices) +{ +#if ASTCENC_SVE == 0 + alignas(16) float vals[4]; + vals[0] = base[indices[0]]; + vals[1] = base[indices[1]]; + vals[2] = base[indices[2]]; + vals[3] = base[indices[3]]; + return vfloat4(vals); +#else + svint32_t offsets = svld1ub_s32(svptrue_pat_b32(SV_VL4), indices); + svfloat32_t data = svld1_gather_s32index_f32(svptrue_pat_b32(SV_VL4), base, offsets); + return vfloat4(svget_neonq_f32(data)); +#endif +} /** * @brief Store a vector to an unaligned memory address. */ diff --git a/Source/astcenc_vecmathlib_none_4.h b/Source/astcenc_vecmathlib_none_4.h index 4646e84ad..862f592a4 100644 --- a/Source/astcenc_vecmathlib_none_4.h +++ b/Source/astcenc_vecmathlib_none_4.h @@ -943,6 +943,18 @@ ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices) base[indices.m[3]]); } +/** + * @brief Load a vector of gathered results from an array using byte indices from memory + */ +template<> +ASTCENC_SIMD_INLINE vfloat4 gatherf_byte_inds(const float* base, const uint8_t* indices) +{ + return vfloat4(base[indices[0]], + base[indices[1]], + base[indices[2]], + base[indices[3]]); +} + /** * @brief Store a vector to an unaligned memory address. */ diff --git a/Source/astcenc_vecmathlib_sse_4.h b/Source/astcenc_vecmathlib_sse_4.h index 0c42c73dc..938ead66e 100644 --- a/Source/astcenc_vecmathlib_sse_4.h +++ b/Source/astcenc_vecmathlib_sse_4.h @@ -900,7 +900,7 @@ ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond) */ ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices) { -#if ASTCENC_AVX >= 2 +#if ASTCENC_AVX >= 2 && ASTCENC_X86_GATHERS != 0 return vfloat4(_mm_i32gather_ps(base, indices.m, 4)); #else alignas(16) int idx[4]; @@ -909,6 +909,23 @@ ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices) #endif } +/** + * @brief Load a vector of gathered results from an array using byte indices from memory + */ +template<> +ASTCENC_SIMD_INLINE vfloat4 gatherf_byte_inds(const float* base, const uint8_t* indices) +{ + // Experimentally, in this particular use case (byte indices in memory), + // using 4 separate scalar loads is appreciably faster than using gathers + // even if they're available, on every x86 uArch tried, so always do the + // separate loads even when ASTCENC_X86_GATHERS is enabled. + // + // Tested on: + // - Intel Skylake-X, Coffee Lake, Crestmont, Redwood Cove + // - AMD Zen 2, Zen 4 + return vfloat4(base[indices[0]], base[indices[1]], base[indices[2]], base[indices[3]]); +} + /** * @brief Store a vector to an unaligned memory address. */ diff --git a/Source/astcenc_vecmathlib_sve_8.h b/Source/astcenc_vecmathlib_sve_8.h index df4c7f434..1e98df02b 100644 --- a/Source/astcenc_vecmathlib_sve_8.h +++ b/Source/astcenc_vecmathlib_sve_8.h @@ -841,6 +841,16 @@ ASTCENC_SIMD_INLINE vfloat8 gatherf(const float* base, vint8 indices) return vfloat8(svld1_gather_s32index_f32(svptrue_b32(), base, indices.m)); } +/** + * @brief Load a vector of gathered results from an array using byte indices from memory + */ +template<> +ASTCENC_SIMD_INLINE vfloat8 gatherf_byte_inds(const float* base, const uint8_t* indices) +{ + svint32_t offsets = svld1ub_s32(svptrue_b32(), indices); + return vfloat8(svld1_gather_s32index_f32(svptrue_b32(), base, offsets)); +} + /** * @brief Store a vector to an unaligned memory address. */ diff --git a/Source/cmake_core.cmake b/Source/cmake_core.cmake index 7442d7903..59772023f 100644 --- a/Source/cmake_core.cmake +++ b/Source/cmake_core.cmake @@ -359,7 +359,8 @@ macro(astcenc_set_properties ASTCENC_TARGET_NAME ASTCENC_VENEER_TYPE) ASTCENC_SSE=20 ASTCENC_AVX=0 ASTCENC_POPCNT=0 - ASTCENC_F16C=0) + ASTCENC_F16C=0 + ASTCENC_X86_GATHERS=0) # Force SSE2 on AppleClang (normally SSE4.1 is the default) target_compile_options(${ASTCENC_TARGET_NAME} @@ -377,7 +378,8 @@ macro(astcenc_set_properties ASTCENC_TARGET_NAME ASTCENC_VENEER_TYPE) ASTCENC_SSE=41 ASTCENC_AVX=0 ASTCENC_POPCNT=1 - ASTCENC_F16C=0) + ASTCENC_F16C=0 + ASTCENC_X86_GATHERS=0) if (${ASTCENC_VENEER_TYPE} GREATER 0) # Force SSE2 on AppleClang (normally SSE4.1 is the default) @@ -395,12 +397,16 @@ macro(astcenc_set_properties ASTCENC_TARGET_NAME ASTCENC_VENEER_TYPE) endif() elseif(${ASTCENC_ISA_SIMD} MATCHES "avx2") + # Gathers are quite slow on many x86 microarchitectures, to the point where + # it can be significantly faster to just avoid them use scalar loads. + target_compile_definitions(${ASTCENC_TARGET_NAME} PRIVATE ASTCENC_NEON=0 ASTCENC_SVE=0 ASTCENC_SSE=41 ASTCENC_AVX=2 + ASTCENC_X86_GATHERS=$ ASTCENC_POPCNT=1 ASTCENC_F16C=1)