From 5af72968024188248339d18cda9f8f31741f8c28 Mon Sep 17 00:00:00 2001 From: Peter Harris Date: Fri, 17 May 2024 11:43:46 +0100 Subject: [PATCH] Add Arm fixed-width 256b SVE vector support. --- CMakeLists.txt | 37 +- Source/CMakeLists.txt | 8 +- Source/UnitTest/CMakeLists.txt | 8 +- Source/UnitTest/cmake_core.cmake | 22 +- Source/UnitTest/test_simd.cpp | 104 +- .../astcenc_ideal_endpoints_and_weights.cpp | 6 +- Source/astcenc_mathlib.h | 8 +- Source/astcenc_pick_best_endpoint_format.cpp | 1 - Source/astcenc_vecmathlib.h | 53 +- Source/astcenc_vecmathlib_avx2_8.h | 13 +- Source/astcenc_vecmathlib_neon_4.h | 25 +- Source/astcenc_vecmathlib_none_4.h | 16 +- Source/astcenc_vecmathlib_sse_4.h | 21 +- Source/astcenc_vecmathlib_sve_8.h | 1089 +++++++++++++++++ Source/astcenccli_entry.cpp | 31 + Source/astcenccli_image.cpp | 3 +- Source/astcenccli_toplevel.cpp | 15 +- Source/astcenccli_toplevel_help.cpp | 4 +- Source/cmake_core.cmake | 22 + Test/astc_test_image.py | 9 +- Test/astc_update_ref.sh | 2 +- 21 files changed, 1317 insertions(+), 180 deletions(-) create mode 100644 Source/astcenc_vecmathlib_sve_8.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 2f633b400..2443c6aaf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -38,6 +38,7 @@ include(CTest) option(ASTCENC_ISA_AVX2 "Enable astcenc builds for AVX2 SIMD") option(ASTCENC_ISA_SSE41 "Enable astcenc builds for SSE4.1 SIMD") option(ASTCENC_ISA_SSE2 "Enable astcenc builds for SSE2 SIMD") +option(ASTCENC_ISA_SVE_256 "Enable astcenc builds for 256-bit SVE SIMD") option(ASTCENC_ISA_NEON "Enable astcenc builds for NEON SIMD") option(ASTCENC_ISA_NONE "Enable astcenc builds for no SIMD") option(ASTCENC_ISA_NATIVE "Enable astcenc builds for native SIMD") @@ -86,7 +87,7 @@ endforeach() # Count options which MUST be arm64 set(ASTCENC_ARM64_ISA_COUNT 0) -set(ASTCENC_CONFIGS ${ASTCENC_ISA_NEON}) +set(ASTCENC_CONFIGS ${ASTCENC_ISA_NEON} ${ASTCENC_ISA_SVE_256}) foreach(ASTCENC_CONFIG ${ASTCENC_CONFIGS}) if(${ASTCENC_CONFIG}) math(EXPR ASTCENC_ARM64_ISA_COUNT "${ASTCENC_ARM64_ISA_COUNT} + 1") @@ -117,22 +118,28 @@ if("${ASTCENC_BLOCK_MAX_TEXELS}") message(STATUS " Max block texels - ${ASTCENC_BLOCK_MAX_TEXELS}") endif() -printopt("AVX2 backend " ${ASTCENC_ISA_AVX2}) -printopt("SSE4.1 backend " ${ASTCENC_ISA_SSE41}) -printopt("SSE2 backend " ${ASTCENC_ISA_SSE2}) -printopt("NEON backend " ${ASTCENC_ISA_NEON}) -printopt("NONE backend " ${ASTCENC_ISA_NONE}) -printopt("NATIVE backend " ${ASTCENC_ISA_NATIVE}) +message(STATUS "Arm backend options") +printopt("SVE 256b backend " ${ASTCENC_ISA_SVE_256}) +printopt("NEON backend " ${ASTCENC_ISA_NEON}) +message(STATUS "x86-64 backend options") +printopt("AVX2 backend " ${ASTCENC_ISA_AVX2}) +printopt("SSE4.1 backend " ${ASTCENC_ISA_SSE41}) +printopt("SSE2 backend " ${ASTCENC_ISA_SSE2}) +message(STATUS "Agnostic backend options") +printopt("NONE backend " ${ASTCENC_ISA_NONE}) +printopt("NATIVE backend " ${ASTCENC_ISA_NATIVE}) +message(STATUS "Build options") if("${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin") - printopt("Universal bin " ${ASTCENC_UNIVERSAL_BUILD}) + printopt("Universal bin " ${ASTCENC_UNIVERSAL_BUILD}) endif() -printopt("Invariance " ${ASTCENC_INVARIANCE}) -printopt("Shared libs " ${ASTCENC_SHAREDLIB}) -printopt("Decompressor " ${ASTCENC_DECOMPRESSOR}) -printopt("Diagnostics " ${ASTCENC_DIAGNOSTICS}) -printopt("ASAN " ${ASTCENC_ASAN}) -printopt("UBSAN " ${ASTCENC_UBSAN}) -printopt("Unit tests " ${ASTCENC_UNITTEST}) +printopt("Invariance " ${ASTCENC_INVARIANCE}) +printopt("Shared libs " 
${ASTCENC_SHAREDLIB}) +printopt("Decompressor " ${ASTCENC_DECOMPRESSOR}) +message(STATUS "Developer options") +printopt("Diagnostics " ${ASTCENC_DIAGNOSTICS}) +printopt("ASAN " ${ASTCENC_ASAN}) +printopt("UBSAN " ${ASTCENC_UBSAN}) +printopt("Unit tests " ${ASTCENC_UNITTEST}) # Subcomponents add_subdirectory(Source) diff --git a/Source/CMakeLists.txt b/Source/CMakeLists.txt index cd00c377c..dd04b23f4 100644 --- a/Source/CMakeLists.txt +++ b/Source/CMakeLists.txt @@ -27,8 +27,8 @@ else() set(ASTCENC_CODEC enc) endif() -set(ASTCENC_ARTIFACTS native none neon avx2 sse4.1 sse2) -set(ASTCENC_CONFIGS ${ASTCENC_ISA_NATIVE} ${ASTCENC_ISA_NONE} ${ASTCENC_ISA_NEON} ${ASTCENC_ISA_AVX2} ${ASTCENC_ISA_SSE41} ${ASTCENC_ISA_SSE2}) +set(ASTCENC_ARTIFACTS native none sve_256 neon avx2 sse4.1 sse2) +set(ASTCENC_CONFIGS ${ASTCENC_ISA_NATIVE} ${ASTCENC_ISA_NONE} ${ASTCENC_ISA_SVE_256} ${ASTCENC_ISA_NEON} ${ASTCENC_ISA_AVX2} ${ASTCENC_ISA_SSE41} ${ASTCENC_ISA_SSE2}) list(LENGTH ASTCENC_ARTIFACTS ASTCENC_ARTIFACTS_LEN) math(EXPR ASTCENC_ARTIFACTS_LEN "${ASTCENC_ARTIFACTS_LEN} - 1") @@ -38,7 +38,9 @@ foreach(INDEX RANGE ${ASTCENC_ARTIFACTS_LEN}) if(${ASTCENC_CONFIG}) set(ASTCENC_ISA_SIMD ${ASTCENC_ARTIFACT}) - if(${ASTCENC_ISA_SIMD} MATCHES "neon") + if(${ASTCENC_ISA_SIMD} MATCHES "sve_256") + # Not suported on macOS + elseif(${ASTCENC_ISA_SIMD} MATCHES "neon") set(CMAKE_OSX_ARCHITECTURES arm64) elseif(${ASTCENC_ISA_SIMD} MATCHES "sse2") set(CMAKE_OSX_ARCHITECTURES x86_64) diff --git a/Source/UnitTest/CMakeLists.txt b/Source/UnitTest/CMakeLists.txt index 2f633c07a..dc4bacad6 100644 --- a/Source/UnitTest/CMakeLists.txt +++ b/Source/UnitTest/CMakeLists.txt @@ -15,8 +15,8 @@ # under the License. # ---------------------------------------------------------------------------- -set(ASTCENC_ARTIFACTS native none neon avx2 sse4.1 sse2) -set(ASTCENC_CONFIGS ${ASTCENC_ISA_NATIVE} ${ASTCENC_ISA_NONE} ${ASTCENC_ISA_NEON} ${ASTCENC_ISA_AVX2} ${ASTCENC_ISA_SSE41} ${ASTCENC_ISA_SSE2}) +set(ASTCENC_ARTIFACTS native none sve_256 neon avx2 sse4.1 sse2) +set(ASTCENC_CONFIGS ${ASTCENC_ISA_NATIVE} ${ASTCENC_ISA_NONE} ${ASTCENC_ISA_SVE_256} ${ASTCENC_ISA_NEON} ${ASTCENC_ISA_AVX2} ${ASTCENC_ISA_SSE41} ${ASTCENC_ISA_SSE2}) list(LENGTH ASTCENC_ARTIFACTS ASTCENC_ARTIFACTS_LEN) math(EXPR ASTCENC_ARTIFACTS_LEN "${ASTCENC_ARTIFACTS_LEN} - 1") @@ -26,7 +26,9 @@ foreach(INDEX RANGE ${ASTCENC_ARTIFACTS_LEN}) if(${ASTCENC_CONFIG}) set(ASTCENC_ISA_SIMD ${ASTCENC_ARTIFACT}) - if(${ASTCENC_ISA_SIMD} MATCHES "neon") + if(${ASTCENC_ISA_SIMD} MATCHES "sve_256") + # Not supported on macOS + elseif(${ASTCENC_ISA_SIMD} MATCHES "neon") set(CMAKE_OSX_ARCHITECTURES arm64) elseif(${ASTCENC_ISA_SIMD} MATCHES "sse2") set(CMAKE_OSX_ARCHITECTURES x86_64) diff --git a/Source/UnitTest/cmake_core.cmake b/Source/UnitTest/cmake_core.cmake index 2fb7a4d70..2fd20979c 100644 --- a/Source/UnitTest/cmake_core.cmake +++ b/Source/UnitTest/cmake_core.cmake @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # ---------------------------------------------------------------------------- -# Copyright 2020-2023 Arm Limited +# Copyright 2020-2024 Arm Limited # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. 
You may obtain a copy @@ -72,6 +72,7 @@ if(${ASTCENC_ISA_SIMD} MATCHES "none") target_compile_definitions(${ASTCENC_TEST} PRIVATE ASTCENC_NEON=0 + ASTCENC_SVE=0 ASTCENC_SSE=0 ASTCENC_AVX=0 ASTCENC_POPCNT=0 @@ -81,15 +82,32 @@ elseif(${ASTCENC_ISA_SIMD} MATCHES "neon") target_compile_definitions(${ASTCENC_TEST} PRIVATE ASTCENC_NEON=1 + ASTCENC_SVE=0 ASTCENC_SSE=0 ASTCENC_AVX=0 ASTCENC_POPCNT=0 ASTCENC_F16C=0) +elseif(${ASTCENC_ISA_SIMD} MATCHES "sve_256") + target_compile_definitions(${ASTCENC_TEST} + PRIVATE + ASTCENC_NEON=1 + ASTCENC_SVE=8 + ASTCENC_SSE=0 + ASTCENC_AVX=0 + ASTCENC_POPCNT=0 + ASTCENC_F16C=0) + + # Enable SVE + target_compile_options(${ASTCENC_TEST} + PRIVATE + -march=armv8-a+sve -msve-vector-bits=256) + elseif(${ASTCENC_ISA_SIMD} MATCHES "sse2") target_compile_definitions(${ASTCENC_TEST} PRIVATE ASTCENC_NEON=0 + ASTCENC_SVE=0 ASTCENC_SSE=20 ASTCENC_AVX=0 ASTCENC_POPCNT=0 @@ -103,6 +121,7 @@ elseif(${ASTCENC_ISA_SIMD} MATCHES "sse4.1") target_compile_definitions(${ASTCENC_TEST} PRIVATE ASTCENC_NEON=0 + ASTCENC_SVE=0 ASTCENC_SSE=41 ASTCENC_AVX=0 ASTCENC_POPCNT=1 @@ -116,6 +135,7 @@ elseif(${ASTCENC_ISA_SIMD} MATCHES "avx2") target_compile_definitions(${ASTCENC_TEST} PRIVATE ASTCENC_NEON=0 + ASTCENC_SVE=0 ASTCENC_SSE=41 ASTCENC_AVX=2 ASTCENC_POPCNT=1 diff --git a/Source/UnitTest/test_simd.cpp b/Source/UnitTest/test_simd.cpp index f86403d0f..f857c3550 100644 --- a/Source/UnitTest/test_simd.cpp +++ b/Source/UnitTest/test_simd.cpp @@ -194,6 +194,7 @@ TEST(vfloat, ChangeSign) vfloat4 a(-1.0f, 1.0f, -3.12f, 3.12f); vfloat4 b(-1.0f, -1.0f, 3.12f, 3.12f); vfloat4 r = change_sign(a, b); + EXPECT_EQ(r.lane<0>(), 1.0f); EXPECT_EQ(r.lane<1>(), -1.0f); EXPECT_EQ(r.lane<2>(), -3.12f); @@ -205,6 +206,7 @@ TEST(vfloat, Atan) { vfloat4 a(-0.15f, 0.0f, 0.9f, 2.1f); vfloat4 r = atan(a); + EXPECT_NEAR(r.lane<0>(), -0.149061f, 0.005f); EXPECT_NEAR(r.lane<1>(), 0.000000f, 0.005f); EXPECT_NEAR(r.lane<2>(), 0.733616f, 0.005f); @@ -217,6 +219,7 @@ TEST(vfloat, Atan2) vfloat4 a(-0.15f, 0.0f, 0.9f, 2.1f); vfloat4 b(1.15f, -3.0f, -0.9f, 1.1f); vfloat4 r = atan2(a, b); + EXPECT_NEAR(r.lane<0>(), -0.129816f, 0.005f); EXPECT_NEAR(r.lane<1>(), 3.141592f, 0.005f); EXPECT_NEAR(r.lane<2>(), 2.360342f, 0.005f); @@ -909,31 +912,6 @@ TEST(vfloat4, select) EXPECT_EQ(r2.lane<3>(), 4.0f); } -/** @brief Test vfloat4 select MSB only. */ -TEST(vfloat4, select_msb) -{ - int msb_set = static_cast(0x80000000); - vint4 msb(msb_set, 0, msb_set, 0); - vmask4 cond(msb.m); - - vfloat4 a(1.0f, 3.0f, 3.0f, 1.0f); - vfloat4 b(4.0f, 2.0f, 2.0f, 4.0f); - - // Select in one direction - vfloat4 r1 = select_msb(a, b, cond); - EXPECT_EQ(r1.lane<0>(), 4.0f); - EXPECT_EQ(r1.lane<1>(), 3.0f); - EXPECT_EQ(r1.lane<2>(), 2.0f); - EXPECT_EQ(r1.lane<3>(), 1.0f); - - // Select in the other - vfloat4 r2 = select_msb(b, a, cond); - EXPECT_EQ(r2.lane<0>(), 1.0f); - EXPECT_EQ(r2.lane<1>(), 2.0f); - EXPECT_EQ(r2.lane<2>(), 3.0f); - EXPECT_EQ(r2.lane<3>(), 4.0f); -} - /** @brief Test vfloat4 gatherf. */ TEST(vfloat4, gatherf) { @@ -1839,12 +1817,17 @@ TEST(vint4, store_lanes_masked_unaligned) EXPECT_TRUE(all(result3v == expect3v)); } -/** @brief Test vint4 pack_low_bytes. */ -TEST(vint4, pack_low_bytes) +/** @brief Test vint4 pack_and_store_low_bytes. 
*/ +TEST(vint4, pack_and_store_low_bytes) { vint4 a(1, 2, 3, 4); - vint4 r = pack_low_bytes(a); - EXPECT_EQ(r.lane<0>(), (4 << 24) | (3 << 16) | (2 << 8) | (1 << 0)); + uint8_t bytes[4] { 0 }; + pack_and_store_low_bytes(a, bytes); + + EXPECT_EQ(bytes[0], 1); + EXPECT_EQ(bytes[1], 2); + EXPECT_EQ(bytes[2], 3); + EXPECT_EQ(bytes[3], 4); } /** @brief Test vint4 select. */ @@ -2711,46 +2694,6 @@ TEST(vfloat8, select) EXPECT_EQ(ra[7], 4.0f); } -/** @brief Test vfloat8 select MSB only. */ -TEST(vfloat8, select_msb) -{ - int msb_set = static_cast(0x80000000); - vint8 msb = vint8_lit(msb_set, 0, msb_set, 0, msb_set, 0, msb_set, 0); - vmask8 cond(msb.m); - - vfloat8 a = vfloat8_lit(1.0f, 3.0f, 3.0f, 1.0f, 1.0f, 3.0f, 3.0f, 1.0f); - vfloat8 b = vfloat8_lit(4.0f, 2.0f, 2.0f, 4.0f, 4.0f, 2.0f, 2.0f, 4.0f); - - // Select in one direction - vfloat8 r1 = select(a, b, cond); - - alignas(32) float ra[8]; - storea(r1, ra); - - EXPECT_EQ(ra[0], 4.0f); - EXPECT_EQ(ra[1], 3.0f); - EXPECT_EQ(ra[2], 2.0f); - EXPECT_EQ(ra[3], 1.0f); - EXPECT_EQ(ra[4], 4.0f); - EXPECT_EQ(ra[5], 3.0f); - EXPECT_EQ(ra[6], 2.0f); - EXPECT_EQ(ra[7], 1.0f); - - // Select in the other - vfloat8 r2 = select(b, a, cond); - - storea(r2, ra); - - EXPECT_EQ(ra[0], 1.0f); - EXPECT_EQ(ra[1], 2.0f); - EXPECT_EQ(ra[2], 3.0f); - EXPECT_EQ(ra[3], 4.0f); - EXPECT_EQ(ra[4], 1.0f); - EXPECT_EQ(ra[5], 2.0f); - EXPECT_EQ(ra[6], 3.0f); - EXPECT_EQ(ra[7], 4.0f); -} - /** @brief Test vfloat8 gatherf. */ TEST(vfloat8, gatherf) { @@ -3583,17 +3526,22 @@ TEST(vint8, store_lanes_masked_unaligned) EXPECT_TRUE(all(result3v == expect3v)); } -/** @brief Test vint8 pack_low_bytes. */ -TEST(vint8, pack_low_bytes) +/** @brief Test vint8 pack_and_store_low_bytes. */ +TEST(vint8, pack_and_store_low_bytes) { vint8 a = vint8_lit(1, 2, 3, 4, 2, 3, 4, 5); - vint8 r = pack_low_bytes(a); - - alignas(32) int ra[8]; - store(r, ra); - - EXPECT_EQ(ra[0], (4 << 24) | (3 << 16) | (2 << 8) | (1 << 0)); - EXPECT_EQ(ra[1], (5 << 24) | (4 << 16) | (3 << 8) | (2 << 0)); + uint8_t bytes[8] { 0 }; + + pack_and_store_low_bytes(a, bytes); + + EXPECT_EQ(bytes[0], 1); + EXPECT_EQ(bytes[1], 2); + EXPECT_EQ(bytes[2], 3); + EXPECT_EQ(bytes[3], 4); + EXPECT_EQ(bytes[4], 2); + EXPECT_EQ(bytes[5], 3); + EXPECT_EQ(bytes[6], 4); + EXPECT_EQ(bytes[7], 5); } /** @brief Test vint8 select. 
 */
diff --git a/Source/astcenc_ideal_endpoints_and_weights.cpp b/Source/astcenc_ideal_endpoints_and_weights.cpp
index 9343a0abb..3442464d5 100644
--- a/Source/astcenc_ideal_endpoints_and_weights.cpp
+++ b/Source/astcenc_ideal_endpoints_and_weights.cpp
@@ -1050,8 +1050,7 @@ void compute_quantized_weights_for_decimation(
 // Invert the weight-scaling that was done initially
 storea(ixl * rscalev + low_boundv, weight_set_out + i);
- vint scn = pack_low_bytes(weight);
- store_nbytes(scn, quantized_weight_set + i);
+ pack_and_store_low_bytes(weight, quantized_weight_set + i);
 }
 }
 else
@@ -1084,8 +1083,7 @@ void compute_quantized_weights_for_decimation(
 // Invert the weight-scaling that was done initially
 storea(ixl * rscalev + low_boundv, weight_set_out + i);
- vint scn = pack_low_bytes(weight);
- store_nbytes(scn, quantized_weight_set + i);
+ pack_and_store_low_bytes(weight, quantized_weight_set + i);
 }
 }
 }
diff --git a/Source/astcenc_mathlib.h b/Source/astcenc_mathlib.h
index 562d6597f..959f9ba99 100644
--- a/Source/astcenc_mathlib.h
+++ b/Source/astcenc_mathlib.h
@@ -73,10 +73,14 @@
 #endif
 #endif
+#ifndef ASTCENC_SVE
+ #define ASTCENC_SVE 0
+#endif
+
 // Force vector-sized SIMD alignment
-#if ASTCENC_AVX
+#if ASTCENC_AVX || ASTCENC_SVE == 8
 #define ASTCENC_VECALIGN 32
-#elif ASTCENC_SSE || ASTCENC_NEON
+#elif ASTCENC_SSE || ASTCENC_NEON || ASTCENC_SVE == 4
 #define ASTCENC_VECALIGN 16
 // Use default alignment for non-SIMD builds
 #else
diff --git a/Source/astcenc_pick_best_endpoint_format.cpp b/Source/astcenc_pick_best_endpoint_format.cpp
index 6e41005bc..ccf9935fe 100644
--- a/Source/astcenc_pick_best_endpoint_format.cpp
+++ b/Source/astcenc_pick_best_endpoint_format.cpp
@@ -1308,7 +1308,6 @@ unsigned int compute_ideal_endpoint_formats(
 vbest_error_index = select(vint(0x7FFFFFFF), vbest_error_index, lanes_min_error);
 int best_error_index = hmin_s(vbest_error_index);
-
 best_error_weights[i] = best_error_index;
 // Max the error for this candidate so we don't pick it again
diff --git a/Source/astcenc_vecmathlib.h b/Source/astcenc_vecmathlib.h
index d48f1d73e..ebdcb7e5e 100644
--- a/Source/astcenc_vecmathlib.h
+++ b/Source/astcenc_vecmathlib.h
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2019-2022 Arm Limited
+// Copyright 2019-2024 Arm Limited
 // Copyright 2008 Jose Fonseca
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
@@ -42,11 +42,12 @@
 *
 * With the current implementation ISA support is provided for:
 *
- * * 1-wide for scalar reference.
- * * 4-wide for Armv8-A NEON.
- * * 4-wide for x86-64 SSE2.
- * * 4-wide for x86-64 SSE4.1.
- * * 8-wide for x86-64 AVX2.
+ * * 1-wide for scalar reference
+ * * 4-wide for Armv8-A NEON
+ * * 4-wide for x86-64 SSE2
+ * * 4-wide for x86-64 SSE4.1
+ * * 8-wide for Armv8-A SVE
+ * * 8-wide for x86-64 AVX2
 */
 #ifndef ASTC_VECMATHLIB_H_INCLUDED
@@ -54,7 +55,14 @@
 #if ASTCENC_SSE != 0 || ASTCENC_AVX != 0
 #include <immintrin.h>
-#elif ASTCENC_NEON != 0
+#endif
+
+#if ASTCENC_SVE != 0
+ #include <arm_sve.h>
+ #include <arm_neon_sve_bridge.h>
+#endif
+
+#if ASTCENC_NEON != 0
 #include <arm_neon.h>
 #endif
@@ -106,6 +114,33 @@
 constexpr auto loada = vfloat4::loada;
 constexpr auto load1 = vfloat4::load1;
+#elif ASTCENC_SVE == 8
+ /* If we have SVE configured as 8-wide, expose 8-wide VLA. */
+ #include "astcenc_vecmathlib_neon_4.h"
+ #include "astcenc_vecmathlib_common_4.h"
+ #include "astcenc_vecmathlib_sve_8.h"
+
+ /* Check the compiler is treating SVE as 256 bits ...
*/ + #if __ARM_FEATURE_SVE_BITS != 256 + #error "__ARM_FEATURE_SVE_BITS is not 256 bits" + #endif + + #define ASTCENC_SIMD_WIDTH 8 + + using vfloat = vfloat8; + + #if defined(ASTCENC_NO_INVARIANCE) + using vfloatacc = vfloat8; + #else + using vfloatacc = vfloat4; + #endif + + using vint = vint8; + using vmask = vmask8; + + constexpr auto loada = vfloat8::loada; + constexpr auto load1 = vfloat8::load1; + #elif ASTCENC_NEON > 0 /* If we have NEON expose 4-wide VLA. */ #include "astcenc_vecmathlib_neon_4.h" @@ -239,8 +274,8 @@ ASTCENC_SIMD_INLINE vfloat atan(vfloat x) ASTCENC_SIMD_INLINE vfloat atan2(vfloat y, vfloat x) { vfloat z = atan(abs(y / x)); - vmask xmask = vmask(float_as_int(x).m); - return change_sign(select_msb(z, vfloat(astc::PI) - z, xmask), y); + vmask xmask = x < vfloat::zero(); + return change_sign(select(z, vfloat(astc::PI) - z, xmask), y); } /* diff --git a/Source/astcenc_vecmathlib_avx2_8.h b/Source/astcenc_vecmathlib_avx2_8.h index 4ef04a451..e2f86a1ce 100644 --- a/Source/astcenc_vecmathlib_avx2_8.h +++ b/Source/astcenc_vecmathlib_avx2_8.h @@ -529,7 +529,7 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint8 a, uint8_t* p) /** * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector. */ -ASTCENC_SIMD_INLINE vint8 pack_low_bytes(vint8 v) +ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint8 v, uint8_t* p) { __m256i shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 28, 24, 20, 16, @@ -541,7 +541,8 @@ ASTCENC_SIMD_INLINE vint8 pack_low_bytes(vint8 v) __m128i b = _mm_unpacklo_epi32(a0, a1); __m256i r = astcenc_mm256_set_m128i(b, b); - return vint8(r); + + store_nbytes(vint8(r), p); } /** @@ -844,14 +845,6 @@ ASTCENC_SIMD_INLINE vfloat8 select(vfloat8 a, vfloat8 b, vmask8 cond) return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m)); } -/** - * @brief Return lanes from @c b if MSB of @c cond is set, else @c a. - */ -ASTCENC_SIMD_INLINE vfloat8 select_msb(vfloat8 a, vfloat8 b, vmask8 cond) -{ - return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m)); -} - /** * @brief Accumulate lane-wise sums for a vector, folded 4-wide. * diff --git a/Source/astcenc_vecmathlib_neon_4.h b/Source/astcenc_vecmathlib_neon_4.h index b0187630b..8d94bcdeb 100644 --- a/Source/astcenc_vecmathlib_neon_4.h +++ b/Source/astcenc_vecmathlib_neon_4.h @@ -115,7 +115,7 @@ struct vfloat4 */ static ASTCENC_SIMD_INLINE vfloat4 zero() { - return vfloat4(vdupq_n_f32(0.0f)); + return vfloat4(0.0f); } /** @@ -603,16 +603,17 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p) } /** - * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector. + * @brief Pack and store low 8 bits of each vector lane. */ -ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a) +ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint4 a, uint8_t* data) { alignas(16) uint8_t shuf[16] { 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; uint8x16_t idx = vld1q_u8(shuf); int8x16_t av = vreinterpretq_s8_s32(a.m); - return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(av, idx))); + a = vint4(vreinterpretq_s32_s8(vqtbl1q_s8(av, idx))); + store_nbytes(a, data); } /** @@ -790,21 +791,12 @@ ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond) return vfloat4(vbslq_f32(cond.m, b.m, a.m)); } -/** - * @brief Return lanes from @c b if MSB of @c cond is set, else @c a. 
- */ -ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond) -{ - static const uint32x4_t msb = vdupq_n_u32(0x80000000u); - uint32x4_t mask = vcgeq_u32(cond.m, msb); - return vfloat4(vbslq_f32(mask, b.m, a.m)); -} - /** * @brief Load a vector of gathered results from an array; */ ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices) { +#if ASTCENC_SVE == 0 alignas(16) int idx[4]; storea(indices, idx); alignas(16) float vals[4]; @@ -813,6 +805,11 @@ ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices) vals[2] = base[idx[2]]; vals[3] = base[idx[3]]; return vfloat4(vals); +#else + svint32_t offsets = svset_neonq_s32(svundef_s32(), indices.m); + svfloat32_t data = svld1_gather_s32index_f32(svptrue_pat_b32(SV_VL4), base, offsets); + return vfloat4(svget_neonq_f32(data)); +#endif } /** diff --git a/Source/astcenc_vecmathlib_none_4.h b/Source/astcenc_vecmathlib_none_4.h index 8e2c57bc2..11c6d62ab 100644 --- a/Source/astcenc_vecmathlib_none_4.h +++ b/Source/astcenc_vecmathlib_none_4.h @@ -679,7 +679,7 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p) /** * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector. */ -ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a) +ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint4 a, uint8_t* p) { int b0 = a.m[0] & 0xFF; int b1 = a.m[1] & 0xFF; @@ -687,7 +687,8 @@ ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a) int b3 = a.m[3] & 0xFF; int b = b0 | (b1 << 8) | (b2 << 16) | (b3 << 24); - return vint4(b, 0, 0, 0); + a = vint4(b, 0, 0, 0); + store_nbytes(a, p); } /** @@ -915,17 +916,6 @@ ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond) (cond.m[3] & static_cast(0x80000000)) ? b.m[3] : a.m[3]); } -/** - * @brief Return lanes from @c b if MSB of @c cond is set, else @c a. - */ -ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond) -{ - return vfloat4((cond.m[0] & static_cast(0x80000000)) ? b.m[0] : a.m[0], - (cond.m[1] & static_cast(0x80000000)) ? b.m[1] : a.m[1], - (cond.m[2] & static_cast(0x80000000)) ? b.m[2] : a.m[2], - (cond.m[3] & static_cast(0x80000000)) ? b.m[3] : a.m[3]); -} - /** * @brief Load a vector of gathered results from an array; */ diff --git a/Source/astcenc_vecmathlib_sse_4.h b/Source/astcenc_vecmathlib_sse_4.h index 163171cfd..c47d73190 100644 --- a/Source/astcenc_vecmathlib_sse_4.h +++ b/Source/astcenc_vecmathlib_sse_4.h @@ -658,15 +658,17 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p) /** * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector. */ -ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a) +ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint4 a, uint8_t* p) { #if ASTCENC_SSE >= 41 __m128i shuf = _mm_set_epi8(0,0,0,0, 0,0,0,0, 0,0,0,0, 12,8,4,0); - return vint4(_mm_shuffle_epi8(a.m, shuf)); + a = vint4(_mm_shuffle_epi8(a.m, shuf)); + store_nbytes(a, p); #else __m128i va = _mm_unpacklo_epi8(a.m, _mm_shuffle_epi32(a.m, _MM_SHUFFLE(1,1,1,1))); __m128i vb = _mm_unpackhi_epi8(a.m, _mm_shuffle_epi32(a.m, _MM_SHUFFLE(3,3,3,3))); - return vint4(_mm_unpacklo_epi16(va, vb)); + a = vint4(_mm_unpacklo_epi16(va, vb)); + store_nbytes(a, p); #endif } @@ -877,19 +879,6 @@ ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond) #endif } -/** - * @brief Return lanes from @c b if MSB of @c cond is set, else @c a. 
- */ -ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond) -{ -#if ASTCENC_SSE >= 41 - return vfloat4(_mm_blendv_ps(a.m, b.m, cond.m)); -#else - __m128 d = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(cond.m), 31)); - return vfloat4(_mm_or_ps(_mm_and_ps(d, b.m), _mm_andnot_ps(d, a.m))); -#endif -} - /** * @brief Load a vector of gathered results from an array; */ diff --git a/Source/astcenc_vecmathlib_sve_8.h b/Source/astcenc_vecmathlib_sve_8.h new file mode 100644 index 000000000..85c91bffe --- /dev/null +++ b/Source/astcenc_vecmathlib_sve_8.h @@ -0,0 +1,1089 @@ +// SPDX-License-Identifier: Apache-2.0 +// ---------------------------------------------------------------------------- +// Copyright 2019-2024 Arm Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// ---------------------------------------------------------------------------- + +/** + * @brief 8x32-bit vectors, implemented using SVE. + * + * This module implements 8-wide 32-bit float, int, and mask vectors for Arm + * SVE. + * + * There is a baseline level of functionality provided by all vector widths and + * implementations. This is implemented using identical function signatures, + * modulo data type, so we can use them as substitutable implementations in VLA + * code. + */ + +#ifndef ASTC_VECMATHLIB_sve_8_H_INCLUDED +#define ASTC_VECMATHLIB_sve_8_H_INCLUDED + +#ifndef ASTCENC_SIMD_INLINE + #error "Include astcenc_vecmathlib.h, do not include directly" +#endif + +#include + +typedef svbool_t svbool_8_t __attribute__((arm_sve_vector_bits(256))); +typedef svuint8_t svuint8_8_t __attribute__((arm_sve_vector_bits(256))); +typedef svuint16_t svuint16_8_t __attribute__((arm_sve_vector_bits(256))); +typedef svuint32_t svuint32_8_t __attribute__((arm_sve_vector_bits(256))); +typedef svint32_t svint32_8_t __attribute__((arm_sve_vector_bits(256))); +typedef svfloat32_t svfloat32_8_t __attribute__((arm_sve_vector_bits(256))); + +// ============================================================================ +// vfloat8 data type +// ============================================================================ + +/** + * @brief Data type for 8-wide floats. + */ +struct vfloat8 +{ + /** + * @brief Construct from zero-initialized value. + */ + ASTCENC_SIMD_INLINE vfloat8() = default; + + /** + * @brief Construct from 8 values loaded from an unaligned address. + * + * Consider using loada() which is better with vectors if data is aligned + * to vector length. + */ + ASTCENC_SIMD_INLINE explicit vfloat8(const float *p) + { + m = svld1_f32(svptrue_b32(), p); + } + + /** + * @brief Construct from 1 scalar value replicated across all lanes. + * + * Consider using zero() for constexpr zeros. + */ + ASTCENC_SIMD_INLINE explicit vfloat8(float a) + { + m = svdup_f32(a); + } + + /** + * @brief Construct from an existing SIMD register. + */ + ASTCENC_SIMD_INLINE explicit vfloat8(svfloat32_8_t a) + { + m = a; + } + + /** + * @brief Factory that returns a vector of zeros. 
+ */
+ static ASTCENC_SIMD_INLINE vfloat8 zero()
+ {
+ return vfloat8(0.0f);
+ }
+
+ /**
+ * @brief Factory that returns a replicated scalar loaded from memory.
+ */
+ static ASTCENC_SIMD_INLINE vfloat8 load1(const float* p)
+ {
+ return vfloat8(*p);
+ }
+
+ /**
+ * @brief Factory that returns a vector loaded from 32B aligned memory.
+ */
+ static ASTCENC_SIMD_INLINE vfloat8 loada(const float* p)
+ {
+ return vfloat8(p);
+ }
+
+ /**
+ * @brief The vector ...
+ */
+ svfloat32_8_t m;
+};
+
+// ============================================================================
+// vint8 data type
+// ============================================================================
+
+/**
+ * @brief Data type for 8-wide ints.
+ */
+struct vint8
+{
+ /**
+ * @brief Construct from zero-initialized value.
+ */
+ ASTCENC_SIMD_INLINE vint8() = default;
+
+ /**
+ * @brief Construct from 8 values loaded from an unaligned address.
+ *
+ * Consider using loada() which is better with vectors if data is aligned
+ * to vector length.
+ */
+ ASTCENC_SIMD_INLINE explicit vint8(const int *p)
+ {
+ m = svld1_s32(svptrue_b32(), p);
+ }
+
+ /**
+ * @brief Construct from 8 uint8_t loaded from an unaligned address.
+ */
+ ASTCENC_SIMD_INLINE explicit vint8(const uint8_t *p)
+ {
+ // Load 8 byte values
+ svbool_8_t pred = svptrue_pat_b8(SV_VL8);
+ svuint8_8_t m8 = svld1_u8(pred, p);
+
+ // Expand to 32-bits
+ svuint16_8_t m16 = svunpklo_u16(m8);
+ svuint32_8_t m32 = svunpklo_u32(m16);
+ m = svreinterpret_s32_u32(m32);
+ }
+
+ /**
+ * @brief Construct from 1 scalar value replicated across all lanes.
+ *
+ * Consider using vint8::zero() for constexpr zeros.
+ */
+ ASTCENC_SIMD_INLINE explicit vint8(int a)
+ {
+ m = svdup_s32(a);
+ }
+
+ /**
+ * @brief Construct from an existing SIMD register.
+ */
+ ASTCENC_SIMD_INLINE explicit vint8(svint32_8_t a)
+ {
+ m = a;
+ }
+
+ /**
+ * @brief Factory that returns a vector of zeros.
+ */
+ static ASTCENC_SIMD_INLINE vint8 zero()
+ {
+ return vint8(0);
+ }
+
+ /**
+ * @brief Factory that returns a replicated scalar loaded from memory.
+ */
+ static ASTCENC_SIMD_INLINE vint8 load1(const int* p)
+ {
+ return vint8(*p);
+ }
+
+ /**
+ * @brief Factory that returns a vector loaded from unaligned memory.
+ */
+ static ASTCENC_SIMD_INLINE vint8 load(const uint8_t* p)
+ {
+ svuint8_8_t data = svld1_u8(svptrue_b8(), p);
+ return vint8(svreinterpret_s32_u8(data));
+ }
+
+ /**
+ * @brief Factory that returns a vector loaded from 32B aligned memory.
+ */
+ static ASTCENC_SIMD_INLINE vint8 loada(const int* p)
+ {
+ return vint8(p);
+ }
+
+ /**
+ * @brief Factory that returns a vector containing the lane IDs.
+ */
+ static ASTCENC_SIMD_INLINE vint8 lane_id()
+ {
+ return vint8(svindex_s32(0, 1));
+ }
+
+ /**
+ * @brief The vector ...
+ */
+ svbool_8_t m;
+};
+
+// ============================================================================
+// vmask8 operators and functions
+// ============================================================================
+
+/**
+ * @brief Overload: mask union (or).
+ */
+ASTCENC_SIMD_INLINE vmask8 operator|(vmask8 a, vmask8 b)
+{
+ return vmask8(svorr_z(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: mask intersect (and).
+ */
+ASTCENC_SIMD_INLINE vmask8 operator&(vmask8 a, vmask8 b)
+{
+ return vmask8(svand_z(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: mask difference (xor).
+ */
+ASTCENC_SIMD_INLINE vmask8 operator^(vmask8 a, vmask8 b)
+{
+ return vmask8(sveor_z(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: mask invert (not).
+ */
+ASTCENC_SIMD_INLINE vmask8 operator~(vmask8 a)
+{
+ return vmask8(svnot_z(svptrue_b32(), a.m));
+}
+
+/**
+ * @brief Return an 8-bit mask code indicating mask status.
+ *
+ * bit0 = lane 0
+ */
+ASTCENC_SIMD_INLINE unsigned int mask(vmask8 a)
+{
+ alignas(32) const int shifta[8] { 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 };
+ svint32_8_t template_vals = svld1_s32(svptrue_b32(), shifta);
+ svint32_8_t active_vals = svsel_s32(a.m, template_vals, svdup_s32(0));
+ return static_cast<unsigned int>(svaddv_s32(svptrue_b32(), active_vals));
+}
+
+/**
+ * @brief True if any lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool any(vmask8 a)
+{
+ return svptest_any(svptrue_b32(), a.m);
+}
+
+/**
+ * @brief True if all lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool all(vmask8 a)
+{
+ return !svptest_any(svptrue_b32(), (~a).m);
+}
+
+// ============================================================================
+// vint8 operators and functions
+// ============================================================================
+/**
+ * @brief Overload: vector by vector addition.
+ */
+ASTCENC_SIMD_INLINE vint8 operator+(vint8 a, vint8 b)
+{
+ return vint8(svadd_s32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector incremental addition.
+ */
+ASTCENC_SIMD_INLINE vint8& operator+=(vint8& a, const vint8& b)
+{
+ a = a + b;
+ return a;
+}
+
+/**
+ * @brief Overload: vector by vector subtraction.
+ */
+ASTCENC_SIMD_INLINE vint8 operator-(vint8 a, vint8 b)
+{
+ return vint8(svsub_s32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector multiplication.
+ */
+ASTCENC_SIMD_INLINE vint8 operator*(vint8 a, vint8 b)
+{
+ return vint8(svmul_s32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector bit invert.
+ */
+ASTCENC_SIMD_INLINE vint8 operator~(vint8 a)
+{
+ return vint8(svnot_s32_x(svptrue_b32(), a.m));
+}
+
+/**
+ * @brief Overload: vector by vector bitwise or.
+ */
+ASTCENC_SIMD_INLINE vint8 operator|(vint8 a, vint8 b)
+{
+ return vint8(svorr_s32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector bitwise and.
+ */
+ASTCENC_SIMD_INLINE vint8 operator&(vint8 a, vint8 b)
+{
+ return vint8(svand_s32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector bitwise xor.
+ */
+ASTCENC_SIMD_INLINE vint8 operator^(vint8 a, vint8 b)
+{
+ return vint8(sveor_s32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector equality.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator==(vint8 a, vint8 b)
+{
+ return vmask8(svcmpeq_s32(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector inequality.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator!=(vint8 a, vint8 b)
+{
+ return vmask8(svcmpne_s32(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector less than.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator<(vint8 a, vint8 b)
+{
+ return vmask8(svcmplt_s32(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector greater than.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator>(vint8 a, vint8 b)
+{
+ return vmask8(svcmpgt_s32(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Logical shift left.
+ */
+template <int s> ASTCENC_SIMD_INLINE vint8 lsl(vint8 a)
+{
+ return vint8(svlsl_n_s32_x(svptrue_b32(), a.m, s));
+}
+
+/**
+ * @brief Arithmetic shift right.
+ */
+template <int s> ASTCENC_SIMD_INLINE vint8 asr(vint8 a)
+{
+ return vint8(svasr_n_s32_x(svptrue_b32(), a.m, s));
+}
+
+/**
+ * @brief Logical shift right.
+ */
+template <int s> ASTCENC_SIMD_INLINE vint8 lsr(vint8 a)
+{
+ svuint32_8_t r = svreinterpret_u32_s32(a.m);
+ r = svlsr_n_u32_x(svptrue_b32(), r, s);
+ return vint8(svreinterpret_s32_u32(r));
+}
+
+/**
+ * @brief Return the min vector of two vectors.
+ */
+ASTCENC_SIMD_INLINE vint8 min(vint8 a, vint8 b)
+{
+ return vint8(svmin_s32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Return the max vector of two vectors.
+ */
+ASTCENC_SIMD_INLINE vint8 max(vint8 a, vint8 b)
+{
+ return vint8(svmax_s32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Return the horizontal minimum of a vector.
+ */
+ASTCENC_SIMD_INLINE vint8 hmin(vint8 a)
+{
+ return vint8(svminv_s32(svptrue_b32(), a.m));
+}
+
+/**
+ * @brief Return the horizontal minimum of a vector.
+ */
+ASTCENC_SIMD_INLINE int hmin_s(vint8 a)
+{
+ return svminv_s32(svptrue_b32(), a.m);
+}
+
+/**
+ * @brief Return the horizontal maximum of a vector.
+ */
+ASTCENC_SIMD_INLINE vint8 hmax(vint8 a)
+{
+ return vint8(svmaxv_s32(svptrue_b32(), a.m));
+}
+
+/**
+ * @brief Return the horizontal maximum of a vector.
+ */
+ASTCENC_SIMD_INLINE int hmax_s(vint8 a)
+{
+ return svmaxv_s32(svptrue_b32(), a.m);
+}
+
+/**
+ * @brief Store a vector to a 16B aligned memory address.
+ */
+ASTCENC_SIMD_INLINE void storea(vint8 a, int* p)
+{
+ svst1_s32(svptrue_b32(), p, a.m);
+}
+
+/**
+ * @brief Store a vector to an unaligned memory address.
+ */
+ASTCENC_SIMD_INLINE void store(vint8 a, int* p)
+{
+ svst1_s32(svptrue_b32(), p, a.m);
+}
+
+/**
+ * @brief Store lowest N (vector width) bytes into an unaligned address.
+ */
+ASTCENC_SIMD_INLINE void store_nbytes(vint8 a, uint8_t* p)
+{
+ svuint8_8_t r = svreinterpret_u8_s32(a.m);
+ svst1_u8(svptrue_pat_b8(SV_VL8), p, r);
+}
+
+/**
+ * @brief Pack and store low 8 bits of each vector lane.
+ */
+ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint8 v, uint8_t* p)
+{
+ svuint32_8_t data = svreinterpret_u32_s32(v.m);
+ svst1b_u32(svptrue_b32(), p, data);
+}
+
+/**
+ * @brief Return lanes from @c b if @c cond is set, else @c a.
+ */
+ASTCENC_SIMD_INLINE vint8 select(vint8 a, vint8 b, vmask8 cond)
+{
+ return vint8(svsel_s32(cond.m, b.m, a.m));
+}
+
+// ============================================================================
+// vfloat8 operators and functions
+// ============================================================================
+
+/**
+ * @brief Overload: vector by vector addition.
+ */
+ASTCENC_SIMD_INLINE vfloat8 operator+(vfloat8 a, vfloat8 b)
+{
+ return vfloat8(svadd_f32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector incremental addition.
+ */ +ASTCENC_SIMD_INLINE vfloat8& operator+=(vfloat8& a, const vfloat8& b) +{ + a = a + b; + return a; +} + +/** + * @brief Overload: vector by vector subtraction. + */ +ASTCENC_SIMD_INLINE vfloat8 operator-(vfloat8 a, vfloat8 b) +{ + return vfloat8(svsub_f32_x(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Overload: vector by vector multiplication. + */ +ASTCENC_SIMD_INLINE vfloat8 operator*(vfloat8 a, vfloat8 b) +{ + return vfloat8(svmul_f32_x(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Overload: vector by scalar multiplication. + */ +ASTCENC_SIMD_INLINE vfloat8 operator*(vfloat8 a, float b) +{ + return vfloat8(svmul_f32_x(svptrue_b32(), a.m, svdup_f32(b))); +} + +/** + * @brief Overload: scalar by vector multiplication. + */ +ASTCENC_SIMD_INLINE vfloat8 operator*(float a, vfloat8 b) +{ + return vfloat8(svmul_f32_x(svptrue_b32(), svdup_f32(a), b.m)); +} + +/** + * @brief Overload: vector by vector division. + */ +ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, vfloat8 b) +{ + return vfloat8(svdiv_f32_x(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Overload: vector by scalar division. + */ +ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, float b) +{ + return vfloat8(svdiv_f32_x(svptrue_b32(), a.m, svdup_f32(b))); +} + + +/** + * @brief Overload: scalar by vector division. + */ +ASTCENC_SIMD_INLINE vfloat8 operator/(float a, vfloat8 b) +{ + return vfloat8(svdiv_f32_x(svptrue_b32(), svdup_f32(a), b.m)); +} + + +/** + * @brief Overload: vector by vector equality. + */ +ASTCENC_SIMD_INLINE vmask8 operator==(vfloat8 a, vfloat8 b) +{ + return vmask8(svcmpeq_f32(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Overload: vector by vector inequality. + */ +ASTCENC_SIMD_INLINE vmask8 operator!=(vfloat8 a, vfloat8 b) +{ + return vmask8(svcmpne_f32(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Overload: vector by vector less than. + */ +ASTCENC_SIMD_INLINE vmask8 operator<(vfloat8 a, vfloat8 b) +{ + return vmask8(svcmplt_f32(svptrue_b32(), a.m, b.m));; +} + +/** + * @brief Overload: vector by vector greater than. + */ +ASTCENC_SIMD_INLINE vmask8 operator>(vfloat8 a, vfloat8 b) +{ + return vmask8(svcmpgt_f32(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Overload: vector by vector less than or equal. + */ +ASTCENC_SIMD_INLINE vmask8 operator<=(vfloat8 a, vfloat8 b) +{ + return vmask8(svcmple_f32(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Overload: vector by vector greater than or equal. + */ +ASTCENC_SIMD_INLINE vmask8 operator>=(vfloat8 a, vfloat8 b) +{ + return vmask8(svcmpge_f32(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Return the min vector of two vectors. + * + * If either lane value is NaN, the other lane will be returned. + */ +ASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, vfloat8 b) +{ + return vfloat8(svminnm_f32_x(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Return the min vector of a vector and a scalar. + * + * If either lane value is NaN, the other lane will be returned. + */ +ASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, float b) +{ + return min(a, vfloat8(b)); +} + +/** + * @brief Return the max vector of two vectors. + * + * If either lane value is NaN, the other lane will be returned. + */ +ASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, vfloat8 b) +{ + return vfloat8(svmaxnm_f32_x(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Return the max vector of a vector and a scalar. + * + * If either lane value is NaN, the other lane will be returned. 
+ */ +ASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, float b) +{ + return max(a, vfloat8(b)); +} + +/** + * @brief Return the clamped value between min and max. + * + * It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN + * then @c min will be returned for that lane. + */ +ASTCENC_SIMD_INLINE vfloat8 clamp(float minv, float maxv, vfloat8 a) +{ + return min(max(a, minv), maxv); +} + +/** + * @brief Return a clamped value between 0.0f and 1.0f. + * + * If @c a is NaN then zero will be returned for that lane. + */ +ASTCENC_SIMD_INLINE vfloat8 clampzo(vfloat8 a) +{ + return clamp(0.0f, 1.0f, a); +} + +/** + * @brief Return the absolute value of the float vector. + */ +ASTCENC_SIMD_INLINE vfloat8 abs(vfloat8 a) +{ + return vfloat8(svabs_f32_x(svptrue_b32(), a.m)); +} + +/** + * @brief Return a float rounded to the nearest integer value. + */ +ASTCENC_SIMD_INLINE vfloat8 round(vfloat8 a) +{ + return vfloat8(svrintn_f32_x(svptrue_b32(), a.m)); +} + +/** + * @brief Return the horizontal minimum of a vector. + */ +ASTCENC_SIMD_INLINE vfloat8 hmin(vfloat8 a) +{ + return vfloat8(svminnmv_f32(svptrue_b32(), a.m)); +} + +/** + * @brief Return the horizontal minimum of a vector. + */ +ASTCENC_SIMD_INLINE float hmin_s(vfloat8 a) +{ + return svminnmv_f32(svptrue_b32(), a.m); +} + +/** + * @brief Return the horizontal maximum of a vector. + */ +ASTCENC_SIMD_INLINE vfloat8 hmax(vfloat8 a) +{ + return vfloat8(svmaxnmv_f32(svptrue_b32(), a.m)); +} + +/** + * @brief Return the horizontal maximum of a vector. + */ +ASTCENC_SIMD_INLINE float hmax_s(vfloat8 a) +{ + return svmaxnmv_f32(svptrue_b32(), a.m); +} + +/** + * @brief Return the horizontal sum of a vector. + */ +ASTCENC_SIMD_INLINE float hadd_s(vfloat8 a) +{ + // Can't use svaddv - it's not invariant + vfloat4 lo(svget_neonq_f32(a.m)); + vfloat4 hi(svget_neonq_f32(svext_f32(a.m, a.m, 4))); + return hadd_s(lo) + hadd_s(hi); +} + +/** + * @brief Return lanes from @c b if @c cond is set, else @c a. + */ +ASTCENC_SIMD_INLINE vfloat8 select(vfloat8 a, vfloat8 b, vmask8 cond) +{ + return vfloat8(svsel_f32(cond.m, b.m, a.m)); +} + +/** + * @brief Accumulate lane-wise sums for a vector, folded 4-wide. + * + * This is invariant with 4-wide implementations. + */ +ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a) +{ + vfloat4 lo(svget_neonq_f32(a.m)); + haccumulate(accum, lo); + + vfloat4 hi(svget_neonq_f32(svext_f32(a.m, a.m, 4))); + haccumulate(accum, hi); +} + +/** + * @brief Accumulate lane-wise sums for a vector. + * + * This is NOT invariant with 4-wide implementations. + */ +ASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a) +{ + accum += a; +} + +/** + * @brief Accumulate masked lane-wise sums for a vector, folded 4-wide. + * + * This is invariant with 4-wide implementations. + */ +ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a, vmask8 m) +{ + a = select(vfloat8::zero(), a, m); + haccumulate(accum, a); +} + +/** + * @brief Accumulate masked lane-wise sums for a vector. + * + * This is NOT invariant with 4-wide implementations. + */ +ASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a, vmask8 m) +{ + accum.m = svadd_f32_m(m.m, accum.m, a.m); +} + +/** + * @brief Return the sqrt of the lanes in the vector. 
+ */ +ASTCENC_SIMD_INLINE vfloat8 sqrt(vfloat8 a) +{ + return vfloat8(svsqrt_f32_x(svptrue_b32(), a.m)); +} + +/** + * @brief Load a vector of gathered results from an array; + */ +ASTCENC_SIMD_INLINE vfloat8 gatherf(const float* base, vint8 indices) +{ + return vfloat8(svld1_gather_s32index_f32(svptrue_b32(), base, indices.m)); +} + +/** + * @brief Store a vector to an unaligned memory address. + */ +ASTCENC_SIMD_INLINE void store(vfloat8 a, float* p) +{ + svst1_f32(svptrue_b32(), p, a.m); +} + +/** + * @brief Store a vector to a 32B aligned memory address. + */ +ASTCENC_SIMD_INLINE void storea(vfloat8 a, float* p) +{ + svst1_f32(svptrue_b32(), p, a.m); +} + +/** + * @brief Return a integer value for a float vector, using truncation. + */ +ASTCENC_SIMD_INLINE vint8 float_to_int(vfloat8 a) +{ + return vint8(svcvt_s32_f32_x(svptrue_b32(), a.m)); +} + +/** + * @brief Return a integer value for a float vector, using round-to-nearest. + */ +ASTCENC_SIMD_INLINE vint8 float_to_int_rtn(vfloat8 a) +{ + a = a + vfloat8(0.5f); + return vint8(svcvt_s32_f32_x(svptrue_b32(), a.m)); +} + +/** + * @brief Return a float value for an integer vector. + */ +ASTCENC_SIMD_INLINE vfloat8 int_to_float(vint8 a) +{ + return vfloat8(svcvt_f32_s32_x(svptrue_b32(), a.m)); +} + +/** + * @brief Return a float value as an integer bit pattern (i.e. no conversion). + * + * It is a common trick to convert floats into integer bit patterns, perform + * some bit hackery based on knowledge they are IEEE 754 layout, and then + * convert them back again. This is the first half of that flip. + */ +ASTCENC_SIMD_INLINE vint8 float_as_int(vfloat8 a) +{ + return vint8(svreinterpret_s32_f32(a.m)); +} + +/** + * @brief Return a integer value as a float bit pattern (i.e. no conversion). + * + * It is a common trick to convert floats into integer bit patterns, perform + * some bit hackery based on knowledge they are IEEE 754 layout, and then + * convert them back again. This is the second half of that flip. + */ +ASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a) +{ + return vfloat8(svreinterpret_f32_s32(a.m)); +} + +/** + * @brief Prepare a vtable lookup table for use with the native SIMD size. + */ +ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint8& t0p) +{ + t0p = vint8(svdup_neonq_f32(t0.m)); +} + +/** + * @brief Prepare a vtable lookup table for use with the native SIMD size. + */ +ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint8& t0p, vint8& t1p) +{ + // 8-wide SVE uses a single table register, so t1 is unused + (void)t1p; + + svfloat32_8_t t0v = svdup_neonq_f32(t0.m); + svfloat32_8_t t1v = svdup_neonq_f32(t1.m); + + t0p = vint8(svext_f32(t0v, t1v, 4)); +} + +/** + * @brief Prepare a vtable lookup table for use with the native SIMD size. + */ +ASTCENC_SIMD_INLINE void vtable_prepare( + vint4 t0, vint4 t1, vint4 t2, vint4 t3, + vint8& t0p, vint8& t1p, vint8& t2p, vint8& t3p) +{ + // 8-wide SVE uses a two table registers, so t2 and t3 are unused + (void)t2p; + (void)t3p; + + svfloat32_8_t t0v = svdup_neonq_f32(t0.m); + svfloat32_8_t t1v = svdup_neonq_f32(t1.m); + svfloat32_8_t t2v = svdup_neonq_f32(t2.m); + svfloat32_8_t t3v = svdup_neonq_f32(t3.m); + + t0p = vint8(svext_f32(t0v, t1v, 4)); + t1p = vint8(svext_f32(t2v, t3v, 4)); +} + +/** + * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes. 
+ */
+ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 idx)
+{
+ // Set index byte above max index for unused bytes so table lookup returns zero
+ svint32_8_t idx_masked = svorr_s32_x(svptrue_b32(), idx.m, svdup_s32(0xFFFFFF00));
+
+ svuint8_8_t idx_bytes = svreinterpret_u8_s32(idx_masked);
+ svuint8_8_t tbl_bytes = svreinterpret_u8_s32(t0.m);
+ svuint8_8_t result = svtbl_u8(tbl_bytes, idx_bytes);
+
+ return vint8(svreinterpret_s32_u8(result));
+}
+
+/**
+ * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
+ */
+ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 idx)
+{
+ // 8-wide SVE uses a single table register, so t1 is unused
+ (void)t1;
+
+ // Set index byte above max index for unused bytes so table lookup returns zero
+ svint32_8_t idx_masked = svorr_s32_x(svptrue_b32(), idx.m, svdup_s32(0xFFFFFF00));
+
+ svuint8_8_t idx_bytes = svreinterpret_u8_s32(idx_masked);
+ svuint8_8_t tbl_bytes = svreinterpret_u8_s32(t0.m);
+ svuint8_8_t result = svtbl_u8(tbl_bytes, idx_bytes);
+
+ return vint8(svreinterpret_s32_u8(result));
+}
+
+/**
+ * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
+ */
+ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3, vint8 idx)
+{
+ // 8-wide SVE uses two table registers, so t2 and t3 are unused
+ (void)t2;
+ (void)t3;
+
+ // Set index byte above max index for unused bytes so table lookup returns zero
+ svint32_8_t literal32 = svdup_s32(32);
+ svbool_8_t idx_lo_select = svcmplt(svptrue_b32(), idx.m, literal32);
+ svint32_8_t idx_lo_masked = svorr_s32_x(svptrue_b32(), idx.m, svdup_s32(0xFFFFFF00));
+ svint32_8_t idx_hi_masked = svorr_s32_x(svptrue_b32(), idx.m - literal32, svdup_s32(0xFFFFFF00));
+
+ svuint8_8_t idx_lo_bytes = svreinterpret_u8_s32(idx_lo_masked);
+ svuint8_8_t idx_hi_bytes = svreinterpret_u8_s32(idx_hi_masked);
+
+ svuint8_8_t tbl0_bytes = svreinterpret_u8_s32(t0.m);
+ svuint8_8_t tbl1_bytes = svreinterpret_u8_s32(t1.m);
+
+ svint32_8_t t0_lookup = svreinterpret_s32_u8(svtbl_u8(tbl0_bytes, idx_lo_bytes));
+ svint32_8_t t1_lookup = svreinterpret_s32_u8(svtbl_u8(tbl1_bytes, idx_hi_bytes));
+
+ svint32_8_t result = svsel_s32(idx_lo_select, t0_lookup, t1_lookup);
+
+ // Future: SVE2 can directly do svtbl2_u8() for a two register table
+ return vint8(result);
+}
+
+/**
+ * @brief Return a vector of interleaved RGBA data.
+ *
+ * Input vectors have the value stored in the bottom 8 bits of each lane,
+ * with high bits set to zero.
+ *
+ * Output vector stores a single RGBA texel packed in each lane.
+ */
+ASTCENC_SIMD_INLINE vint8 interleave_rgba8(vint8 r, vint8 g, vint8 b, vint8 a)
+{
+ return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
+}
+
+/**
+ * @brief Store a vector, skipping masked lanes.
+ *
+ * All masked lanes must be at the end of vector, after all non-masked lanes.
+ */
+ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint8 data, vmask8 mask)
+{
+ svst1_u32(mask.m, reinterpret_cast<uint32_t*>(base), data.m);
+}
+
+/**
+ * @brief Debug function to print a vector of ints.
+ */
+ASTCENC_SIMD_INLINE void print(vint8 a)
+{
+ alignas(32) int v[8];
+ storea(a, v);
+ printf("v8_i32:\n %8d %8d %8d %8d %8d %8d %8d %8d\n",
+ v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
+}
+
+/**
+ * @brief Debug function to print a vector of ints.
+ */
+ASTCENC_SIMD_INLINE void printx(vint8 a)
+{
+ alignas(32) int v[8];
+ storea(a, v);
+ printf("v8_i32:\n %08x %08x %08x %08x %08x %08x %08x %08x\n",
+ v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
+}
+
+/**
+ * @brief Debug function to print a vector of floats.
+ */
+ASTCENC_SIMD_INLINE void print(vfloat8 a)
+{
+ alignas(32) float v[8];
+ storea(a, v);
+ printf("v8_f32:\n %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f\n",
+ static_cast<double>(v[0]), static_cast<double>(v[1]),
+ static_cast<double>(v[2]), static_cast<double>(v[3]),
+ static_cast<double>(v[4]), static_cast<double>(v[5]),
+ static_cast<double>(v[6]), static_cast<double>(v[7]));
+}
+
+/**
+ * @brief Debug function to print a vector of masks.
+ */
+ASTCENC_SIMD_INLINE void print(vmask8 a)
+{
+ print(select(vint8(0), vint8(1), a));
+}
+
+#endif // #ifndef ASTC_VECMATHLIB_sve_8_H_INCLUDED
diff --git a/Source/astcenccli_entry.cpp b/Source/astcenccli_entry.cpp
index 3c56fc225..ab0b95923 100644
--- a/Source/astcenccli_entry.cpp
+++ b/Source/astcenccli_entry.cpp
@@ -37,6 +37,7 @@ int astcenc_main( int argc, char **argv);
+// x86-64 builds
 #if (ASTCENC_SSE > 20) || (ASTCENC_AVX > 0) || \
 (ASTCENC_POPCNT > 0) || (ASTCENC_F16C > 0)
@@ -254,6 +255,36 @@
 return true;
 }
+// Validate Arm SVE availability
+#elif ASTCENC_SVE != 0
+
+#include <sys/auxv.h>
+static bool cpu_supports_sve_256()
+{
+ long hwcaps = getauxval(AT_HWCAP);
+ return (hwcaps & HWCAP_SVE) != 0;
+}
+
+/**
+ * @brief Print a string to stderr.
+ */
+static inline void print_error(
+ const char* format
+) {
+ fprintf(stderr, "%s", format);
+}
+
+static bool validate_cpu_isa()
+{
+ if (!cpu_supports_sve_256())
+ {
+ print_error("ERROR: Host does not support SVE ISA extension\n");
+ return false;
+ }
+
+ return true;
+}
+
 #else
 // Fallback for cases with no dynamic ISA availability
diff --git a/Source/astcenccli_image.cpp b/Source/astcenccli_image.cpp
index 4b1bb637a..237da60c7 100644
--- a/Source/astcenccli_image.cpp
+++ b/Source/astcenccli_image.cpp
@@ -350,8 +350,7 @@ uint8_t* unorm8x4_array_from_astc_img(
 color = clamp(0.0f, 1.0f, color) * 255.0f;
 colori = float_to_int_rtn(color);
- pack_low_bytes(colori);
- store_nbytes(colori, dst + 4 * x);
+ pack_and_store_low_bytes(colori, dst + 4 * x);
 }
 }
 }
diff --git a/Source/astcenccli_toplevel.cpp b/Source/astcenccli_toplevel.cpp
index 39eb586af..dd0941b65 100644
--- a/Source/astcenccli_toplevel.cpp
+++ b/Source/astcenccli_toplevel.cpp
@@ -1382,8 +1382,7 @@ static void image_set_pixel_u8(
 assert(img.data_type == ASTCENC_TYPE_U8);
 uint8_t* data = static_cast<uint8_t*>(img.data[0]);
- pixel = pack_low_bytes(pixel);
- store_nbytes(pixel, data + (4 * img.dim_x * y) + (4 * x ));
+ pack_and_store_low_bytes(pixel, data + (4 * img.dim_x * y) + (4 * x));
 }
 /**
@@ -1886,6 +1885,18 @@ int astcenc_main( int argc, char **argv ) {
+#if ASTCENC_SVE != 0
+ // Do this check here because it needs SVE instructions so cannot be in
+ // the veneer check which is compiled as stock Armv8. We know we have SVE
+ // by the time we get this far, but not the vector width.
+ if (svcntw() != ASTCENC_SVE)
+ {
+ uint32_t bits = ASTCENC_SVE * 32;
+ print_error("ERROR: Host does not implement %u bit SVE ISA extension\n", bits);
+ return 1;
+ }
+#endif
+
 double start_time = get_time();
 if (argc < 2)
diff --git a/Source/astcenccli_toplevel_help.cpp b/Source/astcenccli_toplevel_help.cpp
index 71b9a42d0..b01da6bd5 100644
--- a/Source/astcenccli_toplevel_help.cpp
+++ b/Source/astcenccli_toplevel_help.cpp
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2023 Arm Limited
+// Copyright 2011-2024 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -560,6 +560,8 @@ void astcenc_print_header()
 const char* simdtype = "sse4.1";
 #elif (ASTCENC_SSE == 20)
 const char* simdtype = "sse2";
+#elif (ASTCENC_SVE == 8)
+ const char* simdtype = "sve.256b";
 #elif (ASTCENC_NEON == 1)
 const char* simdtype = "neon";
 #else
diff --git a/Source/cmake_core.cmake b/Source/cmake_core.cmake
index e78eb70bb..172b89502 100644
--- a/Source/cmake_core.cmake
+++ b/Source/cmake_core.cmake
@@ -271,6 +271,7 @@ macro(astcenc_set_properties ASTCENC_TARGET_NAME ASTCENC_IS_VENEER)
 target_compile_definitions(${ASTCENC_TARGET_NAME}
 PRIVATE
 ASTCENC_NEON=0
+ ASTCENC_SVE=0
 ASTCENC_SSE=0
 ASTCENC_AVX=0
 ASTCENC_POPCNT=0
@@ -280,6 +281,7 @@ macro(astcenc_set_properties ASTCENC_TARGET_NAME ASTCENC_IS_VENEER)
 target_compile_definitions(${ASTCENC_TARGET_NAME}
 PRIVATE
 ASTCENC_NEON=1
+ ASTCENC_SVE=0
 ASTCENC_SSE=0
 ASTCENC_AVX=0
 ASTCENC_POPCNT=0
@@ -293,10 +295,28 @@ macro(astcenc_set_properties ASTCENC_TARGET_NAME ASTCENC_IS_VENEER)
 $<${is_msvccl}:/d2ssa-cfg-sink->)
 endif()
+ elseif(${ASTCENC_ISA_SIMD} MATCHES "sve_256")
+ target_compile_definitions(${ASTCENC_TARGET_NAME}
+ PRIVATE
+ ASTCENC_NEON=1
+ ASTCENC_SVE=8
+ ASTCENC_SSE=0
+ ASTCENC_AVX=0
+ ASTCENC_POPCNT=0
+ ASTCENC_F16C=0)
+
+ # Enable SVE
+ if (NOT ${ASTCENC_IS_VENEER})
+ target_compile_options(${ASTCENC_TARGET_NAME}
+ PRIVATE
+ -march=armv8-a+sve -msve-vector-bits=256)
+ endif()
+
 elseif(${ASTCENC_ISA_SIMD} MATCHES "sse2")
 target_compile_definitions(${ASTCENC_TARGET_NAME}
 PRIVATE
 ASTCENC_NEON=0
+ ASTCENC_SVE=0
 ASTCENC_SSE=20
 ASTCENC_AVX=0
 ASTCENC_POPCNT=0
@@ -314,6 +334,7 @@ macro(astcenc_set_properties ASTCENC_TARGET_NAME ASTCENC_IS_VENEER)
 target_compile_definitions(${ASTCENC_TARGET_NAME}
 PRIVATE
 ASTCENC_NEON=0
+ ASTCENC_SVE=0
 ASTCENC_SSE=41
 ASTCENC_AVX=0
 ASTCENC_POPCNT=1
@@ -338,6 +359,7 @@ macro(astcenc_set_properties ASTCENC_TARGET_NAME ASTCENC_IS_VENEER)
 target_compile_definitions(${ASTCENC_TARGET_NAME}
 PRIVATE
 ASTCENC_NEON=0
+ ASTCENC_SVE=0
 ASTCENC_SSE=41
 ASTCENC_AVX=2
 ASTCENC_POPCNT=1
diff --git a/Test/astc_test_image.py b/Test/astc_test_image.py
index 141b7015c..df4e51115 100644
--- a/Test/astc_test_image.py
+++ b/Test/astc_test_image.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: Apache-2.0
 # -----------------------------------------------------------------------------
-# Copyright 2019-2023 Arm Limited
+# Copyright 2019-2024 Arm Limited
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not
 # use this file except in compliance with the License.
You may obtain a copy @@ -289,13 +289,12 @@ def parse_command_line(): refcoders = ["ref-1.7", "ref-2.5-neon", "ref-2.5-sse2", "ref-2.5-sse4.1", "ref-2.5-avx2", "ref-3.7-neon", "ref-3.7-sse2", "ref-3.7-sse4.1", "ref-3.7-avx2", - "ref-4.4-neon", "ref-4.4-sse2", "ref-4.4-sse4.1", "ref-4.4-avx2", "ref-4.5-neon", "ref-4.5-sse2", "ref-4.5-sse4.1", "ref-4.5-avx2", - "ref-main-neon", "ref-main-sse2", "ref-main-sse4.1", "ref-main-avx2"] + "ref-main-neon", "ref-main-sve_256", "ref-main-sse2", "ref-main-sse4.1", "ref-main-avx2"] # All test encoders - testcoders = ["none", "neon", "sse2", "sse4.1", "avx2", "native", "universal"] - testcodersAArch64 = ["neon"] + testcoders = ["none", "neon", "sve_256", "sse2", "sse4.1", "avx2", "native", "universal"] + testcodersAArch64 = ["neon", "sve_256"] testcodersX86 = ["sse2", "sse4.1", "avx2"] coders = refcoders + testcoders + ["all-aarch64", "all-x86"] diff --git a/Test/astc_update_ref.sh b/Test/astc_update_ref.sh index bf6c32006..827d089e8 100755 --- a/Test/astc_update_ref.sh +++ b/Test/astc_update_ref.sh @@ -17,6 +17,6 @@ echo "" TARGET_ROOT=${1} -python3 ./Test/astc_test_image.py --test-set all --block-size all --test-quality all --repeats 6 --encoder ref-$1-avx2 +python3 ./Test/astc_test_image.py --test-set all --block-size all --test-quality all --repeats 6 --encoder ref-$1-neon #python3 ./Test/astc_test_image.py --test-set all --block-size all --test-quality all --repeats 6 --encoder ref-$1-sse4.1 #python3 ./Test/astc_test_image.py --test-set all --block-size all --test-quality all --repeats 6 --encoder ref-$1-sse2
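Build-and-test sketch: to exercise the new backend locally on an AArch64 host whose SVE vector length is 256 bits, a minimal configuration could look like the commands below. The ASTCENC_ISA_SVE_256 and ASTCENC_UNITTEST options, the sve_256 encoder name, and the --test-set flag come from this patch and the existing test scripts; the build directory name, the Release build type, and the assumption of a GCC or Clang toolchain accepting -march=armv8-a+sve -msve-vector-bits=256 are illustrative, not part of the change.

    # Configure and build the fixed-width 256-bit SVE variant
    cmake -S . -B build_sve -DCMAKE_BUILD_TYPE=Release -DASTCENC_ISA_SVE_256=ON -DASTCENC_UNITTEST=ON
    cmake --build build_sve -j

    # Run the vector-math unit tests, then the image test sweep against the new artifact
    (cd build_sve && ctest)
    python3 ./Test/astc_test_image.py --encoder sve_256 --test-set all

Note that the resulting binaries refuse to run elsewhere: validate_cpu_isa() checks HWCAP_SVE at startup, and astcenc_main() additionally checks svcntw() against ASTCENC_SVE to confirm the 256-bit vector width.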