Skip to content

Commit

Permalink
Add gatherf_byte_inds for gathers using byte indices from memory (#511)
Browse files Browse the repository at this point in the history
Adds a new function to wrap gathers using byte indices from 
memory, avoiding the byte-to-int conversion for ISAs that don't
have native gathers. 

Adds a new build option ASTCENC_X86_GATHERS (default ON) to
allow builds to disable use of native gathers on X86 as they are 
much slower than scalar fallbacks on some microarchitectures 
(AMD Zen, pre-Skylake Intel).

Co-authored-by: Fabian Giesen <[email protected]>
Co-authored-by: Pete Harris <[email protected]>
  • Loading branch information
3 people authored Nov 4, 2024
1 parent 521179c commit 546f9dd
Show file tree
Hide file tree
Showing 13 changed files with 144 additions and 36 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ option(ASTCENC_UBSAN "Enable astcenc builds with undefined behavior sanitizer")
option(ASTCENC_UNITTEST "Enable astcenc builds with unit tests")
option(ASTCENC_INVARIANCE "Enable astcenc floating point invariance" ON)
option(ASTCENC_CLI "Enable build of astcenc command line tools" ON)
option(ASTCENC_X86_GATHERS "Enable use of native x86 gathers" ON)

# Preflight for some macOS-specific build options
if("${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin")
Expand Down Expand Up @@ -127,6 +128,7 @@ message(STATUS "x86-64 backend options")
printopt("AVX2 backend " ${ASTCENC_ISA_AVX2})
printopt("SSE4.1 backend " ${ASTCENC_ISA_SSE41})
printopt("SSE2 backend " ${ASTCENC_ISA_SSE2})
printopt("Use native gathers " ${ASTCENC_X86_GATHERS})
message(STATUS "Agnostic backend options")
printopt("NONE backend " ${ASTCENC_ISA_NONE})
printopt("NATIVE backend " ${ASTCENC_ISA_NATIVE})
Expand Down
11 changes: 11 additions & 0 deletions Docs/Building.md
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,17 @@ To enable this binary variant add `-DASTCENC_ISA_NONE=ON` to the CMake command
line when configuring. It is NOT recommended to use this for production; it is
significantly slower than the vectorized SIMD builds.

### No x86 gather instruction builds

On many x86 microarchitectures the native AVX gather instructions are slower
than simply performing manual scalar loads and combining the results. Gathers
are enabled by default, but can be disabled by setting the CMake option
`-DASTCENC_X86_GATHERS=OFF` on the command line when configuring.

Note that we have seen mixed results when compiling the scalar fallback path,
so we would recommend testing which option works best for the compiler and
microarchitecture pairing that you are targeting.

### Test builds

We support building unit tests. These use the `googletest` framework, which is
Expand Down
18 changes: 9 additions & 9 deletions Source/astcenc_averages_and_directions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -778,12 +778,12 @@ void compute_error_squared_rgba(
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
{
vmask mask = lane_ids < vint(texel_count);
vint texel_idxs(texel_indexes + i);
const uint8_t* texel_idxs = texel_indexes + i;

vfloat data_r = gatherf(blk.data_r, texel_idxs);
vfloat data_g = gatherf(blk.data_g, texel_idxs);
vfloat data_b = gatherf(blk.data_b, texel_idxs);
vfloat data_a = gatherf(blk.data_a, texel_idxs);
vfloat data_r = gatherf_byte_inds<vfloat>(blk.data_r, texel_idxs);
vfloat data_g = gatherf_byte_inds<vfloat>(blk.data_g, texel_idxs);
vfloat data_b = gatherf_byte_inds<vfloat>(blk.data_b, texel_idxs);
vfloat data_a = gatherf_byte_inds<vfloat>(blk.data_a, texel_idxs);

vfloat uncor_param = (data_r * l_uncor_bs0)
+ (data_g * l_uncor_bs1)
Expand Down Expand Up @@ -892,11 +892,11 @@ void compute_error_squared_rgb(
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
{
vmask mask = lane_ids < vint(texel_count);
vint texel_idxs(texel_indexes + i);
const uint8_t* texel_idxs = texel_indexes + i;

vfloat data_r = gatherf(blk.data_r, texel_idxs);
vfloat data_g = gatherf(blk.data_g, texel_idxs);
vfloat data_b = gatherf(blk.data_b, texel_idxs);
vfloat data_r = gatherf_byte_inds<vfloat>(blk.data_r, texel_idxs);
vfloat data_g = gatherf_byte_inds<vfloat>(blk.data_g, texel_idxs);
vfloat data_b = gatherf_byte_inds<vfloat>(blk.data_b, texel_idxs);

vfloat uncor_param = (data_r * l_uncor_bs0)
+ (data_g * l_uncor_bs1)
Expand Down
38 changes: 19 additions & 19 deletions Source/astcenc_ideal_endpoints_and_weights.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,16 +41,16 @@ static vfloat bilinear_infill_vla(
unsigned int index
) {
// Load the bilinear filter texel weight indexes in the decimated grid
vint weight_idx0 = vint(di.texel_weights_tr[0] + index);
vint weight_idx1 = vint(di.texel_weights_tr[1] + index);
vint weight_idx2 = vint(di.texel_weights_tr[2] + index);
vint weight_idx3 = vint(di.texel_weights_tr[3] + index);
const uint8_t* weight_idx0 = di.texel_weights_tr[0] + index;
const uint8_t* weight_idx1 = di.texel_weights_tr[1] + index;
const uint8_t* weight_idx2 = di.texel_weights_tr[2] + index;
const uint8_t* weight_idx3 = di.texel_weights_tr[3] + index;

// Load the bilinear filter weights from the decimated grid
vfloat weight_val0 = gatherf(weights, weight_idx0);
vfloat weight_val1 = gatherf(weights, weight_idx1);
vfloat weight_val2 = gatherf(weights, weight_idx2);
vfloat weight_val3 = gatherf(weights, weight_idx3);
vfloat weight_val0 = gatherf_byte_inds<vfloat>(weights, weight_idx0);
vfloat weight_val1 = gatherf_byte_inds<vfloat>(weights, weight_idx1);
vfloat weight_val2 = gatherf_byte_inds<vfloat>(weights, weight_idx2);
vfloat weight_val3 = gatherf_byte_inds<vfloat>(weights, weight_idx3);

// Load the weight contribution factors for each decimated weight
vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index);
Expand Down Expand Up @@ -81,12 +81,12 @@ static vfloat bilinear_infill_vla_2(
unsigned int index
) {
// Load the bilinear filter texel weight indexes in the decimated grid
vint weight_idx0 = vint(di.texel_weights_tr[0] + index);
vint weight_idx1 = vint(di.texel_weights_tr[1] + index);
const uint8_t* weight_idx0 = di.texel_weights_tr[0] + index;
const uint8_t* weight_idx1 = di.texel_weights_tr[1] + index;

// Load the bilinear filter weights from the decimated grid
vfloat weight_val0 = gatherf(weights, weight_idx0);
vfloat weight_val1 = gatherf(weights, weight_idx1);
vfloat weight_val0 = gatherf_byte_inds<vfloat>(weights, weight_idx0);
vfloat weight_val1 = gatherf_byte_inds<vfloat>(weights, weight_idx1);

// Load the weight contribution factors for each decimated weight
vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index);
Expand Down Expand Up @@ -894,18 +894,18 @@ void compute_ideal_weights_for_decimation(

for (unsigned int j = 0; j < max_texel_count; j++)
{
vint texel(di.weight_texels_tr[j] + i);
const uint8_t* texel = di.weight_texels_tr[j] + i;
vfloat weight = loada(di.weights_texel_contribs_tr[j] + i);

if (!constant_wes)
{
weight_error_scale = gatherf(ei.weight_error_scale, texel);
weight_error_scale = gatherf_byte_inds<vfloat>(ei.weight_error_scale, texel);
}

vfloat contrib_weight = weight * weight_error_scale;

weight_weight += contrib_weight;
initial_weight += gatherf(ei.weights, texel) * contrib_weight;
initial_weight += gatherf_byte_inds<vfloat>(ei.weights, texel) * contrib_weight;
}

storea(initial_weight / weight_weight, dec_weight_ideal_value + i);
Expand Down Expand Up @@ -952,17 +952,17 @@ void compute_ideal_weights_for_decimation(

for (unsigned int j = 0; j < max_texel_count; j++)
{
vint texel(di.weight_texels_tr[j] + i);
const uint8_t* texel = di.weight_texels_tr[j] + i;
vfloat contrib_weight = loada(di.weights_texel_contribs_tr[j] + i);

if (!constant_wes)
{
weight_error_scale = gatherf(ei.weight_error_scale, texel);
weight_error_scale = gatherf_byte_inds<vfloat>(ei.weight_error_scale, texel);
}

vfloat scale = weight_error_scale * contrib_weight;
vfloat old_weight = gatherf(infilled_weights, texel);
vfloat ideal_weight = gatherf(ei.weights, texel);
vfloat old_weight = gatherf_byte_inds<vfloat>(infilled_weights, texel);
vfloat ideal_weight = gatherf_byte_inds<vfloat>(ei.weights, texel);

error_change0 += contrib_weight * scale;
error_change1 += (old_weight - ideal_weight) * scale;
Expand Down
2 changes: 2 additions & 0 deletions Source/astcenc_mathlib.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,10 @@
#ifndef ASTCENC_AVX
#if defined(__AVX2__)
#define ASTCENC_AVX 2
#define ASTCENC_X86_GATHERS 1
#elif defined(__AVX__)
#define ASTCENC_AVX 1
#define ASTCENC_X86_GATHERS 1
#else
#define ASTCENC_AVX 0
#endif
Expand Down
10 changes: 5 additions & 5 deletions Source/astcenc_pick_best_endpoint_format.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -123,21 +123,21 @@ static void compute_error_squared_rgb_single_partition(
vint lane_ids = vint::lane_id();
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
{
vint tix(texel_indexes + i);
const uint8_t* tix = texel_indexes + i;

vmask mask = lane_ids < vint(texel_count);
lane_ids += vint(ASTCENC_SIMD_WIDTH);

// Compute the error that arises from just ditching alpha
vfloat data_a = gatherf(blk.data_a, tix);
vfloat data_a = gatherf_byte_inds<vfloat>(blk.data_a, tix);
vfloat alpha_diff = data_a - default_a;
alpha_diff = alpha_diff * alpha_diff;

haccumulate(a_drop_errv, alpha_diff, mask);

vfloat data_r = gatherf(blk.data_r, tix);
vfloat data_g = gatherf(blk.data_g, tix);
vfloat data_b = gatherf(blk.data_b, tix);
vfloat data_r = gatherf_byte_inds<vfloat>(blk.data_r, tix);
vfloat data_g = gatherf_byte_inds<vfloat>(blk.data_g, tix);
vfloat data_b = gatherf_byte_inds<vfloat>(blk.data_b, tix);

// Compute uncorrelated error
vfloat param = data_r * uncor_bs0
Expand Down
2 changes: 2 additions & 0 deletions Source/astcenc_vecmathlib.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@
#define ASTCENC_NO_INLINE __attribute__ ((noinline))
#endif

template<typename T> T gatherf_byte_inds(const float* base, const uint8_t* indices);

#if ASTCENC_AVX >= 2
// If we have AVX2 expose 8-wide VLA.
#include "astcenc_vecmathlib_sse_4.h"
Expand Down
27 changes: 27 additions & 0 deletions Source/astcenc_vecmathlib_avx2_8.h
Original file line number Diff line number Diff line change
Expand Up @@ -903,6 +903,33 @@ ASTCENC_SIMD_INLINE vfloat8 gatherf(const float* base, vint8 indices)
return vfloat8(_mm256_i32gather_ps(base, indices.m, 4));
}

/**
* @brief Load a vector of gathered results from an array using byte indices from memory
*/
template<>
ASTCENC_SIMD_INLINE vfloat8 gatherf_byte_inds<vfloat8>(const float* base, const uint8_t* indices)
{
#if ASTCENC_X86_GATHERS == 0
// Perform manual gather using scalar loads in two separate dependency chains,
// then merge late. MSVC translates this 1:1, which is OK. Clang turns it
// into a bunch of memory-operand inserts on 128-bit halves then merges late,
// which performs significantly worse in tests.
__m256 m0 = _mm256_broadcast_ss(base + indices[0]);
__m256 m1 = _mm256_broadcast_ss(base + indices[1]);
m0 = _mm256_blend_ps(m0, _mm256_broadcast_ss(base + indices[2]), 1 << 2);
m1 = _mm256_blend_ps(m1, _mm256_broadcast_ss(base + indices[3]), 1 << 3);
m0 = _mm256_blend_ps(m0, _mm256_broadcast_ss(base + indices[4]), 1 << 4);
m1 = _mm256_blend_ps(m1, _mm256_broadcast_ss(base + indices[5]), 1 << 5);
m0 = _mm256_blend_ps(m0, _mm256_broadcast_ss(base + indices[6]), 1 << 6);
m1 = _mm256_blend_ps(m1, _mm256_broadcast_ss(base + indices[7]), 1 << 7);

return vfloat8(_mm256_blend_ps(m0, m1, 0xaa));
#else
vint8 inds(indices);
return gatherf(base, inds);
#endif
}

/**
* @brief Store a vector to an unaligned memory address.
*/
Expand Down
19 changes: 19 additions & 0 deletions Source/astcenc_vecmathlib_neon_4.h
Original file line number Diff line number Diff line change
Expand Up @@ -828,6 +828,25 @@ ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
#endif
}

/**
* @brief Load a vector of gathered results from an array using byte indices from memory
*/
template<>
ASTCENC_SIMD_INLINE vfloat4 gatherf_byte_inds<vfloat4>(const float* base, const uint8_t* indices)
{
#if ASTCENC_SVE == 0
alignas(16) float vals[4];
vals[0] = base[indices[0]];
vals[1] = base[indices[1]];
vals[2] = base[indices[2]];
vals[3] = base[indices[3]];
return vfloat4(vals);
#else
svint32_t offsets = svld1ub_s32(svptrue_pat_b32(SV_VL4), indices);
svfloat32_t data = svld1_gather_s32index_f32(svptrue_pat_b32(SV_VL4), base, offsets);
return vfloat4(svget_neonq_f32(data));
#endif
}
/**
* @brief Store a vector to an unaligned memory address.
*/
Expand Down
12 changes: 12 additions & 0 deletions Source/astcenc_vecmathlib_none_4.h
Original file line number Diff line number Diff line change
Expand Up @@ -943,6 +943,18 @@ ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
base[indices.m[3]]);
}

/**
* @brief Load a vector of gathered results from an array using byte indices from memory
*/
template<>
ASTCENC_SIMD_INLINE vfloat4 gatherf_byte_inds<vfloat4>(const float* base, const uint8_t* indices)
{
return vfloat4(base[indices[0]],
base[indices[1]],
base[indices[2]],
base[indices[3]]);
}

/**
* @brief Store a vector to an unaligned memory address.
*/
Expand Down
19 changes: 18 additions & 1 deletion Source/astcenc_vecmathlib_sse_4.h
Original file line number Diff line number Diff line change
Expand Up @@ -900,7 +900,7 @@ ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
*/
ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
{
#if ASTCENC_AVX >= 2
#if ASTCENC_AVX >= 2 && ASTCENC_X86_GATHERS != 0
return vfloat4(_mm_i32gather_ps(base, indices.m, 4));
#else
alignas(16) int idx[4];
Expand All @@ -909,6 +909,23 @@ ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
#endif
}

/**
* @brief Load a vector of gathered results from an array using byte indices from memory
*/
template<>
ASTCENC_SIMD_INLINE vfloat4 gatherf_byte_inds<vfloat4>(const float* base, const uint8_t* indices)
{
// Experimentally, in this particular use case (byte indices in memory),
// using 4 separate scalar loads is appreciably faster than using gathers
// even if they're available, on every x86 uArch tried, so always do the
// separate loads even when ASTCENC_X86_GATHERS is enabled.
//
// Tested on:
// - Intel Skylake-X, Coffee Lake, Crestmont, Redwood Cove
// - AMD Zen 2, Zen 4
return vfloat4(base[indices[0]], base[indices[1]], base[indices[2]], base[indices[3]]);
}

/**
* @brief Store a vector to an unaligned memory address.
*/
Expand Down
10 changes: 10 additions & 0 deletions Source/astcenc_vecmathlib_sve_8.h
Original file line number Diff line number Diff line change
Expand Up @@ -841,6 +841,16 @@ ASTCENC_SIMD_INLINE vfloat8 gatherf(const float* base, vint8 indices)
return vfloat8(svld1_gather_s32index_f32(svptrue_b32(), base, indices.m));
}

/**
* @brief Load a vector of gathered results from an array using byte indices from memory
*/
template<>
ASTCENC_SIMD_INLINE vfloat8 gatherf_byte_inds<vfloat8>(const float* base, const uint8_t* indices)
{
svint32_t offsets = svld1ub_s32(svptrue_b32(), indices);
return vfloat8(svld1_gather_s32index_f32(svptrue_b32(), base, offsets));
}

/**
* @brief Store a vector to an unaligned memory address.
*/
Expand Down
10 changes: 8 additions & 2 deletions Source/cmake_core.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,8 @@ macro(astcenc_set_properties ASTCENC_TARGET_NAME ASTCENC_VENEER_TYPE)
ASTCENC_SSE=20
ASTCENC_AVX=0
ASTCENC_POPCNT=0
ASTCENC_F16C=0)
ASTCENC_F16C=0
ASTCENC_X86_GATHERS=0)

# Force SSE2 on AppleClang (normally SSE4.1 is the default)
target_compile_options(${ASTCENC_TARGET_NAME}
Expand All @@ -377,7 +378,8 @@ macro(astcenc_set_properties ASTCENC_TARGET_NAME ASTCENC_VENEER_TYPE)
ASTCENC_SSE=41
ASTCENC_AVX=0
ASTCENC_POPCNT=1
ASTCENC_F16C=0)
ASTCENC_F16C=0
ASTCENC_X86_GATHERS=0)

if (${ASTCENC_VENEER_TYPE} GREATER 0)
# Force SSE2 on AppleClang (normally SSE4.1 is the default)
Expand All @@ -395,12 +397,16 @@ macro(astcenc_set_properties ASTCENC_TARGET_NAME ASTCENC_VENEER_TYPE)
endif()

elseif(${ASTCENC_ISA_SIMD} MATCHES "avx2")
# Gathers are quite slow on many x86 microarchitectures, to the point where
# it can be significantly faster to just avoid them use scalar loads.

target_compile_definitions(${ASTCENC_TARGET_NAME}
PRIVATE
ASTCENC_NEON=0
ASTCENC_SVE=0
ASTCENC_SSE=41
ASTCENC_AVX=2
ASTCENC_X86_GATHERS=$<BOOL:${ASTCENC_X86_GATHERS}>
ASTCENC_POPCNT=1
ASTCENC_F16C=1)

Expand Down

0 comments on commit 546f9dd

Please sign in to comment.