From 5af72968024188248339d18cda9f8f31741f8c28 Mon Sep 17 00:00:00 2001 From: Peter Harris Date: Fri, 17 May 2024 11:43:46 +0100 Subject: [PATCH] Add Arm fixed-width 256b SVE vector support. --- CMakeLists.txt | 37 +- Source/CMakeLists.txt | 8 +- Source/UnitTest/CMakeLists.txt | 8 +- Source/UnitTest/cmake_core.cmake | 22 +- Source/UnitTest/test_simd.cpp | 104 +- .../astcenc_ideal_endpoints_and_weights.cpp | 6 +- Source/astcenc_mathlib.h | 8 +- Source/astcenc_pick_best_endpoint_format.cpp | 1 - Source/astcenc_vecmathlib.h | 53 +- Source/astcenc_vecmathlib_avx2_8.h | 13 +- Source/astcenc_vecmathlib_neon_4.h | 25 +- Source/astcenc_vecmathlib_none_4.h | 16 +- Source/astcenc_vecmathlib_sse_4.h | 21 +- Source/astcenc_vecmathlib_sve_8.h | 1089 +++++++++++++++++ Source/astcenccli_entry.cpp | 31 + Source/astcenccli_image.cpp | 3 +- Source/astcenccli_toplevel.cpp | 15 +- Source/astcenccli_toplevel_help.cpp | 4 +- Source/cmake_core.cmake | 22 + Test/astc_test_image.py | 9 +- Test/astc_update_ref.sh | 2 +- 21 files changed, 1317 insertions(+), 180 deletions(-) create mode 100644 Source/astcenc_vecmathlib_sve_8.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 2f633b400..2443c6aaf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -38,6 +38,7 @@ include(CTest) option(ASTCENC_ISA_AVX2 "Enable astcenc builds for AVX2 SIMD") option(ASTCENC_ISA_SSE41 "Enable astcenc builds for SSE4.1 SIMD") option(ASTCENC_ISA_SSE2 "Enable astcenc builds for SSE2 SIMD") +option(ASTCENC_ISA_SVE_256 "Enable astcenc builds for 256-bit SVE SIMD") option(ASTCENC_ISA_NEON "Enable astcenc builds for NEON SIMD") option(ASTCENC_ISA_NONE "Enable astcenc builds for no SIMD") option(ASTCENC_ISA_NATIVE "Enable astcenc builds for native SIMD") @@ -86,7 +87,7 @@ endforeach() # Count options which MUST be arm64 set(ASTCENC_ARM64_ISA_COUNT 0) -set(ASTCENC_CONFIGS ${ASTCENC_ISA_NEON}) +set(ASTCENC_CONFIGS ${ASTCENC_ISA_NEON} ${ASTCENC_ISA_SVE_256}) foreach(ASTCENC_CONFIG ${ASTCENC_CONFIGS}) if(${ASTCENC_CONFIG}) math(EXPR ASTCENC_ARM64_ISA_COUNT "${ASTCENC_ARM64_ISA_COUNT} + 1") @@ -117,22 +118,28 @@ if("${ASTCENC_BLOCK_MAX_TEXELS}") message(STATUS " Max block texels - ${ASTCENC_BLOCK_MAX_TEXELS}") endif() -printopt("AVX2 backend " ${ASTCENC_ISA_AVX2}) -printopt("SSE4.1 backend " ${ASTCENC_ISA_SSE41}) -printopt("SSE2 backend " ${ASTCENC_ISA_SSE2}) -printopt("NEON backend " ${ASTCENC_ISA_NEON}) -printopt("NONE backend " ${ASTCENC_ISA_NONE}) -printopt("NATIVE backend " ${ASTCENC_ISA_NATIVE}) +message(STATUS "Arm backend options") +printopt("SVE 256b backend " ${ASTCENC_ISA_SVE_256}) +printopt("NEON backend " ${ASTCENC_ISA_NEON}) +message(STATUS "x86-64 backend options") +printopt("AVX2 backend " ${ASTCENC_ISA_AVX2}) +printopt("SSE4.1 backend " ${ASTCENC_ISA_SSE41}) +printopt("SSE2 backend " ${ASTCENC_ISA_SSE2}) +message(STATUS "Agnostic backend options") +printopt("NONE backend " ${ASTCENC_ISA_NONE}) +printopt("NATIVE backend " ${ASTCENC_ISA_NATIVE}) +message(STATUS "Build options") if("${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin") - printopt("Universal bin " ${ASTCENC_UNIVERSAL_BUILD}) + printopt("Universal bin " ${ASTCENC_UNIVERSAL_BUILD}) endif() -printopt("Invariance " ${ASTCENC_INVARIANCE}) -printopt("Shared libs " ${ASTCENC_SHAREDLIB}) -printopt("Decompressor " ${ASTCENC_DECOMPRESSOR}) -printopt("Diagnostics " ${ASTCENC_DIAGNOSTICS}) -printopt("ASAN " ${ASTCENC_ASAN}) -printopt("UBSAN " ${ASTCENC_UBSAN}) -printopt("Unit tests " ${ASTCENC_UNITTEST}) +printopt("Invariance " ${ASTCENC_INVARIANCE}) +printopt("Shared libs " 
${ASTCENC_SHAREDLIB}) +printopt("Decompressor " ${ASTCENC_DECOMPRESSOR}) +message(STATUS "Developer options") +printopt("Diagnostics " ${ASTCENC_DIAGNOSTICS}) +printopt("ASAN " ${ASTCENC_ASAN}) +printopt("UBSAN " ${ASTCENC_UBSAN}) +printopt("Unit tests " ${ASTCENC_UNITTEST}) # Subcomponents add_subdirectory(Source) diff --git a/Source/CMakeLists.txt b/Source/CMakeLists.txt index cd00c377c..dd04b23f4 100644 --- a/Source/CMakeLists.txt +++ b/Source/CMakeLists.txt @@ -27,8 +27,8 @@ else() set(ASTCENC_CODEC enc) endif() -set(ASTCENC_ARTIFACTS native none neon avx2 sse4.1 sse2) -set(ASTCENC_CONFIGS ${ASTCENC_ISA_NATIVE} ${ASTCENC_ISA_NONE} ${ASTCENC_ISA_NEON} ${ASTCENC_ISA_AVX2} ${ASTCENC_ISA_SSE41} ${ASTCENC_ISA_SSE2}) +set(ASTCENC_ARTIFACTS native none sve_256 neon avx2 sse4.1 sse2) +set(ASTCENC_CONFIGS ${ASTCENC_ISA_NATIVE} ${ASTCENC_ISA_NONE} ${ASTCENC_ISA_SVE_256} ${ASTCENC_ISA_NEON} ${ASTCENC_ISA_AVX2} ${ASTCENC_ISA_SSE41} ${ASTCENC_ISA_SSE2}) list(LENGTH ASTCENC_ARTIFACTS ASTCENC_ARTIFACTS_LEN) math(EXPR ASTCENC_ARTIFACTS_LEN "${ASTCENC_ARTIFACTS_LEN} - 1") @@ -38,7 +38,9 @@ foreach(INDEX RANGE ${ASTCENC_ARTIFACTS_LEN}) if(${ASTCENC_CONFIG}) set(ASTCENC_ISA_SIMD ${ASTCENC_ARTIFACT}) - if(${ASTCENC_ISA_SIMD} MATCHES "neon") + if(${ASTCENC_ISA_SIMD} MATCHES "sve_256") + # Not suported on macOS + elseif(${ASTCENC_ISA_SIMD} MATCHES "neon") set(CMAKE_OSX_ARCHITECTURES arm64) elseif(${ASTCENC_ISA_SIMD} MATCHES "sse2") set(CMAKE_OSX_ARCHITECTURES x86_64) diff --git a/Source/UnitTest/CMakeLists.txt b/Source/UnitTest/CMakeLists.txt index 2f633c07a..dc4bacad6 100644 --- a/Source/UnitTest/CMakeLists.txt +++ b/Source/UnitTest/CMakeLists.txt @@ -15,8 +15,8 @@ # under the License. # ---------------------------------------------------------------------------- -set(ASTCENC_ARTIFACTS native none neon avx2 sse4.1 sse2) -set(ASTCENC_CONFIGS ${ASTCENC_ISA_NATIVE} ${ASTCENC_ISA_NONE} ${ASTCENC_ISA_NEON} ${ASTCENC_ISA_AVX2} ${ASTCENC_ISA_SSE41} ${ASTCENC_ISA_SSE2}) +set(ASTCENC_ARTIFACTS native none sve_256 neon avx2 sse4.1 sse2) +set(ASTCENC_CONFIGS ${ASTCENC_ISA_NATIVE} ${ASTCENC_ISA_NONE} ${ASTCENC_ISA_SVE_256} ${ASTCENC_ISA_NEON} ${ASTCENC_ISA_AVX2} ${ASTCENC_ISA_SSE41} ${ASTCENC_ISA_SSE2}) list(LENGTH ASTCENC_ARTIFACTS ASTCENC_ARTIFACTS_LEN) math(EXPR ASTCENC_ARTIFACTS_LEN "${ASTCENC_ARTIFACTS_LEN} - 1") @@ -26,7 +26,9 @@ foreach(INDEX RANGE ${ASTCENC_ARTIFACTS_LEN}) if(${ASTCENC_CONFIG}) set(ASTCENC_ISA_SIMD ${ASTCENC_ARTIFACT}) - if(${ASTCENC_ISA_SIMD} MATCHES "neon") + if(${ASTCENC_ISA_SIMD} MATCHES "sve_256") + # Not supported on macOS + elseif(${ASTCENC_ISA_SIMD} MATCHES "neon") set(CMAKE_OSX_ARCHITECTURES arm64) elseif(${ASTCENC_ISA_SIMD} MATCHES "sse2") set(CMAKE_OSX_ARCHITECTURES x86_64) diff --git a/Source/UnitTest/cmake_core.cmake b/Source/UnitTest/cmake_core.cmake index 2fb7a4d70..2fd20979c 100644 --- a/Source/UnitTest/cmake_core.cmake +++ b/Source/UnitTest/cmake_core.cmake @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # ---------------------------------------------------------------------------- -# Copyright 2020-2023 Arm Limited +# Copyright 2020-2024 Arm Limited # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. 
You may obtain a copy @@ -72,6 +72,7 @@ if(${ASTCENC_ISA_SIMD} MATCHES "none") target_compile_definitions(${ASTCENC_TEST} PRIVATE ASTCENC_NEON=0 + ASTCENC_SVE=0 ASTCENC_SSE=0 ASTCENC_AVX=0 ASTCENC_POPCNT=0 @@ -81,15 +82,32 @@ elseif(${ASTCENC_ISA_SIMD} MATCHES "neon") target_compile_definitions(${ASTCENC_TEST} PRIVATE ASTCENC_NEON=1 + ASTCENC_SVE=0 ASTCENC_SSE=0 ASTCENC_AVX=0 ASTCENC_POPCNT=0 ASTCENC_F16C=0) +elseif(${ASTCENC_ISA_SIMD} MATCHES "sve_256") + target_compile_definitions(${ASTCENC_TEST} + PRIVATE + ASTCENC_NEON=1 + ASTCENC_SVE=8 + ASTCENC_SSE=0 + ASTCENC_AVX=0 + ASTCENC_POPCNT=0 + ASTCENC_F16C=0) + + # Enable SVE + target_compile_options(${ASTCENC_TEST} + PRIVATE + -march=armv8-a+sve -msve-vector-bits=256) + elseif(${ASTCENC_ISA_SIMD} MATCHES "sse2") target_compile_definitions(${ASTCENC_TEST} PRIVATE ASTCENC_NEON=0 + ASTCENC_SVE=0 ASTCENC_SSE=20 ASTCENC_AVX=0 ASTCENC_POPCNT=0 @@ -103,6 +121,7 @@ elseif(${ASTCENC_ISA_SIMD} MATCHES "sse4.1") target_compile_definitions(${ASTCENC_TEST} PRIVATE ASTCENC_NEON=0 + ASTCENC_SVE=0 ASTCENC_SSE=41 ASTCENC_AVX=0 ASTCENC_POPCNT=1 @@ -116,6 +135,7 @@ elseif(${ASTCENC_ISA_SIMD} MATCHES "avx2") target_compile_definitions(${ASTCENC_TEST} PRIVATE ASTCENC_NEON=0 + ASTCENC_SVE=0 ASTCENC_SSE=41 ASTCENC_AVX=2 ASTCENC_POPCNT=1 diff --git a/Source/UnitTest/test_simd.cpp b/Source/UnitTest/test_simd.cpp index f86403d0f..f857c3550 100644 --- a/Source/UnitTest/test_simd.cpp +++ b/Source/UnitTest/test_simd.cpp @@ -194,6 +194,7 @@ TEST(vfloat, ChangeSign) vfloat4 a(-1.0f, 1.0f, -3.12f, 3.12f); vfloat4 b(-1.0f, -1.0f, 3.12f, 3.12f); vfloat4 r = change_sign(a, b); + EXPECT_EQ(r.lane<0>(), 1.0f); EXPECT_EQ(r.lane<1>(), -1.0f); EXPECT_EQ(r.lane<2>(), -3.12f); @@ -205,6 +206,7 @@ TEST(vfloat, Atan) { vfloat4 a(-0.15f, 0.0f, 0.9f, 2.1f); vfloat4 r = atan(a); + EXPECT_NEAR(r.lane<0>(), -0.149061f, 0.005f); EXPECT_NEAR(r.lane<1>(), 0.000000f, 0.005f); EXPECT_NEAR(r.lane<2>(), 0.733616f, 0.005f); @@ -217,6 +219,7 @@ TEST(vfloat, Atan2) vfloat4 a(-0.15f, 0.0f, 0.9f, 2.1f); vfloat4 b(1.15f, -3.0f, -0.9f, 1.1f); vfloat4 r = atan2(a, b); + EXPECT_NEAR(r.lane<0>(), -0.129816f, 0.005f); EXPECT_NEAR(r.lane<1>(), 3.141592f, 0.005f); EXPECT_NEAR(r.lane<2>(), 2.360342f, 0.005f); @@ -909,31 +912,6 @@ TEST(vfloat4, select) EXPECT_EQ(r2.lane<3>(), 4.0f); } -/** @brief Test vfloat4 select MSB only. */ -TEST(vfloat4, select_msb) -{ - int msb_set = static_cast(0x80000000); - vint4 msb(msb_set, 0, msb_set, 0); - vmask4 cond(msb.m); - - vfloat4 a(1.0f, 3.0f, 3.0f, 1.0f); - vfloat4 b(4.0f, 2.0f, 2.0f, 4.0f); - - // Select in one direction - vfloat4 r1 = select_msb(a, b, cond); - EXPECT_EQ(r1.lane<0>(), 4.0f); - EXPECT_EQ(r1.lane<1>(), 3.0f); - EXPECT_EQ(r1.lane<2>(), 2.0f); - EXPECT_EQ(r1.lane<3>(), 1.0f); - - // Select in the other - vfloat4 r2 = select_msb(b, a, cond); - EXPECT_EQ(r2.lane<0>(), 1.0f); - EXPECT_EQ(r2.lane<1>(), 2.0f); - EXPECT_EQ(r2.lane<2>(), 3.0f); - EXPECT_EQ(r2.lane<3>(), 4.0f); -} - /** @brief Test vfloat4 gatherf. */ TEST(vfloat4, gatherf) { @@ -1839,12 +1817,17 @@ TEST(vint4, store_lanes_masked_unaligned) EXPECT_TRUE(all(result3v == expect3v)); } -/** @brief Test vint4 pack_low_bytes. */ -TEST(vint4, pack_low_bytes) +/** @brief Test vint4 pack_and_store_low_bytes. 
*/ +TEST(vint4, pack_and_store_low_bytes) { vint4 a(1, 2, 3, 4); - vint4 r = pack_low_bytes(a); - EXPECT_EQ(r.lane<0>(), (4 << 24) | (3 << 16) | (2 << 8) | (1 << 0)); + uint8_t bytes[4] { 0 }; + pack_and_store_low_bytes(a, bytes); + + EXPECT_EQ(bytes[0], 1); + EXPECT_EQ(bytes[1], 2); + EXPECT_EQ(bytes[2], 3); + EXPECT_EQ(bytes[3], 4); } /** @brief Test vint4 select. */ @@ -2711,46 +2694,6 @@ TEST(vfloat8, select) EXPECT_EQ(ra[7], 4.0f); } -/** @brief Test vfloat8 select MSB only. */ -TEST(vfloat8, select_msb) -{ - int msb_set = static_cast(0x80000000); - vint8 msb = vint8_lit(msb_set, 0, msb_set, 0, msb_set, 0, msb_set, 0); - vmask8 cond(msb.m); - - vfloat8 a = vfloat8_lit(1.0f, 3.0f, 3.0f, 1.0f, 1.0f, 3.0f, 3.0f, 1.0f); - vfloat8 b = vfloat8_lit(4.0f, 2.0f, 2.0f, 4.0f, 4.0f, 2.0f, 2.0f, 4.0f); - - // Select in one direction - vfloat8 r1 = select(a, b, cond); - - alignas(32) float ra[8]; - storea(r1, ra); - - EXPECT_EQ(ra[0], 4.0f); - EXPECT_EQ(ra[1], 3.0f); - EXPECT_EQ(ra[2], 2.0f); - EXPECT_EQ(ra[3], 1.0f); - EXPECT_EQ(ra[4], 4.0f); - EXPECT_EQ(ra[5], 3.0f); - EXPECT_EQ(ra[6], 2.0f); - EXPECT_EQ(ra[7], 1.0f); - - // Select in the other - vfloat8 r2 = select(b, a, cond); - - storea(r2, ra); - - EXPECT_EQ(ra[0], 1.0f); - EXPECT_EQ(ra[1], 2.0f); - EXPECT_EQ(ra[2], 3.0f); - EXPECT_EQ(ra[3], 4.0f); - EXPECT_EQ(ra[4], 1.0f); - EXPECT_EQ(ra[5], 2.0f); - EXPECT_EQ(ra[6], 3.0f); - EXPECT_EQ(ra[7], 4.0f); -} - /** @brief Test vfloat8 gatherf. */ TEST(vfloat8, gatherf) { @@ -3583,17 +3526,22 @@ TEST(vint8, store_lanes_masked_unaligned) EXPECT_TRUE(all(result3v == expect3v)); } -/** @brief Test vint8 pack_low_bytes. */ -TEST(vint8, pack_low_bytes) +/** @brief Test vint8 pack_and_store_low_bytes. */ +TEST(vint8, pack_and_store_low_bytes) { vint8 a = vint8_lit(1, 2, 3, 4, 2, 3, 4, 5); - vint8 r = pack_low_bytes(a); - - alignas(32) int ra[8]; - store(r, ra); - - EXPECT_EQ(ra[0], (4 << 24) | (3 << 16) | (2 << 8) | (1 << 0)); - EXPECT_EQ(ra[1], (5 << 24) | (4 << 16) | (3 << 8) | (2 << 0)); + uint8_t bytes[8] { 0 }; + + pack_and_store_low_bytes(a, bytes); + + EXPECT_EQ(bytes[0], 1); + EXPECT_EQ(bytes[1], 2); + EXPECT_EQ(bytes[2], 3); + EXPECT_EQ(bytes[3], 4); + EXPECT_EQ(bytes[4], 2); + EXPECT_EQ(bytes[5], 3); + EXPECT_EQ(bytes[6], 4); + EXPECT_EQ(bytes[7], 5); } /** @brief Test vint8 select. 
 */
diff --git a/Source/astcenc_ideal_endpoints_and_weights.cpp b/Source/astcenc_ideal_endpoints_and_weights.cpp
index 9343a0abb..3442464d5 100644
--- a/Source/astcenc_ideal_endpoints_and_weights.cpp
+++ b/Source/astcenc_ideal_endpoints_and_weights.cpp
@@ -1050,8 +1050,7 @@ void compute_quantized_weights_for_decimation(
 // Invert the weight-scaling that was done initially
 storea(ixl * rscalev + low_boundv, weight_set_out + i);
- vint scn = pack_low_bytes(weight);
- store_nbytes(scn, quantized_weight_set + i);
+ pack_and_store_low_bytes(weight, quantized_weight_set + i);
 }
 }
 else
@@ -1084,8 +1083,7 @@ void compute_quantized_weights_for_decimation(
 // Invert the weight-scaling that was done initially
 storea(ixl * rscalev + low_boundv, weight_set_out + i);
- vint scn = pack_low_bytes(weight);
- store_nbytes(scn, quantized_weight_set + i);
+ pack_and_store_low_bytes(weight, quantized_weight_set + i);
 }
 }
 }
diff --git a/Source/astcenc_mathlib.h b/Source/astcenc_mathlib.h
index 562d6597f..959f9ba99 100644
--- a/Source/astcenc_mathlib.h
+++ b/Source/astcenc_mathlib.h
@@ -73,10 +73,14 @@
 #endif
 #endif
+#ifndef ASTCENC_SVE
+ #define ASTCENC_SVE 0
+#endif
+
 // Force vector-sized SIMD alignment
-#if ASTCENC_AVX
+#if ASTCENC_AVX || ASTCENC_SVE == 8
 #define ASTCENC_VECALIGN 32
-#elif ASTCENC_SSE || ASTCENC_NEON
+#elif ASTCENC_SSE || ASTCENC_NEON || ASTCENC_SVE == 4
 #define ASTCENC_VECALIGN 16
 // Use default alignment for non-SIMD builds
 #else
diff --git a/Source/astcenc_pick_best_endpoint_format.cpp b/Source/astcenc_pick_best_endpoint_format.cpp
index 6e41005bc..ccf9935fe 100644
--- a/Source/astcenc_pick_best_endpoint_format.cpp
+++ b/Source/astcenc_pick_best_endpoint_format.cpp
@@ -1308,7 +1308,6 @@ unsigned int compute_ideal_endpoint_formats(
 vbest_error_index = select(vint(0x7FFFFFFF), vbest_error_index, lanes_min_error);
 int best_error_index = hmin_s(vbest_error_index);
-
 best_error_weights[i] = best_error_index;
 // Max the error for this candidate so we don't pick it again
diff --git a/Source/astcenc_vecmathlib.h b/Source/astcenc_vecmathlib.h
index d48f1d73e..ebdcb7e5e 100644
--- a/Source/astcenc_vecmathlib.h
+++ b/Source/astcenc_vecmathlib.h
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2019-2022 Arm Limited
+// Copyright 2019-2024 Arm Limited
 // Copyright 2008 Jose Fonseca
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
@@ -42,11 +42,12 @@
 *
 * With the current implementation ISA support is provided for:
 *
- * * 1-wide for scalar reference.
- * * 4-wide for Armv8-A NEON.
- * * 4-wide for x86-64 SSE2.
- * * 4-wide for x86-64 SSE4.1.
- * * 8-wide for x86-64 AVX2.
+ * * 1-wide for scalar reference
+ * * 4-wide for Armv8-A NEON
+ * * 4-wide for x86-64 SSE2
+ * * 4-wide for x86-64 SSE4.1
+ * * 8-wide for Armv8-A SVE
+ * * 8-wide for x86-64 AVX2
 */
 #ifndef ASTC_VECMATHLIB_H_INCLUDED
@@ -54,7 +55,14 @@
 #if ASTCENC_SSE != 0 || ASTCENC_AVX != 0
 #include <immintrin.h>
-#elif ASTCENC_NEON != 0
+#endif
+
+#if ASTCENC_SVE != 0
+ #include <arm_sve.h>
+ #include <arm_neon_sve_bridge.h>
+#endif
+
+#if ASTCENC_NEON != 0
 #include <arm_neon.h>
 #endif
@@ -106,6 +114,33 @@
 constexpr auto loada = vfloat4::loada;
 constexpr auto load1 = vfloat4::load1;
+#elif ASTCENC_SVE == 8
+ /* If we have SVE configured as 8-wide, expose 8-wide VLA. */
+ #include "astcenc_vecmathlib_neon_4.h"
+ #include "astcenc_vecmathlib_common_4.h"
+ #include "astcenc_vecmathlib_sve_8.h"
+
+ /* Check the compiler is treating SVE as 256 bits ...
*/ + #if __ARM_FEATURE_SVE_BITS != 256 + #error "__ARM_FEATURE_SVE_BITS is not 256 bits" + #endif + + #define ASTCENC_SIMD_WIDTH 8 + + using vfloat = vfloat8; + + #if defined(ASTCENC_NO_INVARIANCE) + using vfloatacc = vfloat8; + #else + using vfloatacc = vfloat4; + #endif + + using vint = vint8; + using vmask = vmask8; + + constexpr auto loada = vfloat8::loada; + constexpr auto load1 = vfloat8::load1; + #elif ASTCENC_NEON > 0 /* If we have NEON expose 4-wide VLA. */ #include "astcenc_vecmathlib_neon_4.h" @@ -239,8 +274,8 @@ ASTCENC_SIMD_INLINE vfloat atan(vfloat x) ASTCENC_SIMD_INLINE vfloat atan2(vfloat y, vfloat x) { vfloat z = atan(abs(y / x)); - vmask xmask = vmask(float_as_int(x).m); - return change_sign(select_msb(z, vfloat(astc::PI) - z, xmask), y); + vmask xmask = x < vfloat::zero(); + return change_sign(select(z, vfloat(astc::PI) - z, xmask), y); } /* diff --git a/Source/astcenc_vecmathlib_avx2_8.h b/Source/astcenc_vecmathlib_avx2_8.h index 4ef04a451..e2f86a1ce 100644 --- a/Source/astcenc_vecmathlib_avx2_8.h +++ b/Source/astcenc_vecmathlib_avx2_8.h @@ -529,7 +529,7 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint8 a, uint8_t* p) /** * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector. */ -ASTCENC_SIMD_INLINE vint8 pack_low_bytes(vint8 v) +ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint8 v, uint8_t* p) { __m256i shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 28, 24, 20, 16, @@ -541,7 +541,8 @@ ASTCENC_SIMD_INLINE vint8 pack_low_bytes(vint8 v) __m128i b = _mm_unpacklo_epi32(a0, a1); __m256i r = astcenc_mm256_set_m128i(b, b); - return vint8(r); + + store_nbytes(vint8(r), p); } /** @@ -844,14 +845,6 @@ ASTCENC_SIMD_INLINE vfloat8 select(vfloat8 a, vfloat8 b, vmask8 cond) return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m)); } -/** - * @brief Return lanes from @c b if MSB of @c cond is set, else @c a. - */ -ASTCENC_SIMD_INLINE vfloat8 select_msb(vfloat8 a, vfloat8 b, vmask8 cond) -{ - return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m)); -} - /** * @brief Accumulate lane-wise sums for a vector, folded 4-wide. * diff --git a/Source/astcenc_vecmathlib_neon_4.h b/Source/astcenc_vecmathlib_neon_4.h index b0187630b..8d94bcdeb 100644 --- a/Source/astcenc_vecmathlib_neon_4.h +++ b/Source/astcenc_vecmathlib_neon_4.h @@ -115,7 +115,7 @@ struct vfloat4 */ static ASTCENC_SIMD_INLINE vfloat4 zero() { - return vfloat4(vdupq_n_f32(0.0f)); + return vfloat4(0.0f); } /** @@ -603,16 +603,17 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p) } /** - * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector. + * @brief Pack and store low 8 bits of each vector lane. */ -ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a) +ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint4 a, uint8_t* data) { alignas(16) uint8_t shuf[16] { 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; uint8x16_t idx = vld1q_u8(shuf); int8x16_t av = vreinterpretq_s8_s32(a.m); - return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(av, idx))); + a = vint4(vreinterpretq_s32_s8(vqtbl1q_s8(av, idx))); + store_nbytes(a, data); } /** @@ -790,21 +791,12 @@ ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond) return vfloat4(vbslq_f32(cond.m, b.m, a.m)); } -/** - * @brief Return lanes from @c b if MSB of @c cond is set, else @c a. 
- */ -ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond) -{ - static const uint32x4_t msb = vdupq_n_u32(0x80000000u); - uint32x4_t mask = vcgeq_u32(cond.m, msb); - return vfloat4(vbslq_f32(mask, b.m, a.m)); -} - /** * @brief Load a vector of gathered results from an array; */ ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices) { +#if ASTCENC_SVE == 0 alignas(16) int idx[4]; storea(indices, idx); alignas(16) float vals[4]; @@ -813,6 +805,11 @@ ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices) vals[2] = base[idx[2]]; vals[3] = base[idx[3]]; return vfloat4(vals); +#else + svint32_t offsets = svset_neonq_s32(svundef_s32(), indices.m); + svfloat32_t data = svld1_gather_s32index_f32(svptrue_pat_b32(SV_VL4), base, offsets); + return vfloat4(svget_neonq_f32(data)); +#endif } /** diff --git a/Source/astcenc_vecmathlib_none_4.h b/Source/astcenc_vecmathlib_none_4.h index 8e2c57bc2..11c6d62ab 100644 --- a/Source/astcenc_vecmathlib_none_4.h +++ b/Source/astcenc_vecmathlib_none_4.h @@ -679,7 +679,7 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p) /** * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector. */ -ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a) +ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint4 a, uint8_t* p) { int b0 = a.m[0] & 0xFF; int b1 = a.m[1] & 0xFF; @@ -687,7 +687,8 @@ ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a) int b3 = a.m[3] & 0xFF; int b = b0 | (b1 << 8) | (b2 << 16) | (b3 << 24); - return vint4(b, 0, 0, 0); + a = vint4(b, 0, 0, 0); + store_nbytes(a, p); } /** @@ -915,17 +916,6 @@ ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond) (cond.m[3] & static_cast(0x80000000)) ? b.m[3] : a.m[3]); } -/** - * @brief Return lanes from @c b if MSB of @c cond is set, else @c a. - */ -ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond) -{ - return vfloat4((cond.m[0] & static_cast(0x80000000)) ? b.m[0] : a.m[0], - (cond.m[1] & static_cast(0x80000000)) ? b.m[1] : a.m[1], - (cond.m[2] & static_cast(0x80000000)) ? b.m[2] : a.m[2], - (cond.m[3] & static_cast(0x80000000)) ? b.m[3] : a.m[3]); -} - /** * @brief Load a vector of gathered results from an array; */ diff --git a/Source/astcenc_vecmathlib_sse_4.h b/Source/astcenc_vecmathlib_sse_4.h index 163171cfd..c47d73190 100644 --- a/Source/astcenc_vecmathlib_sse_4.h +++ b/Source/astcenc_vecmathlib_sse_4.h @@ -658,15 +658,17 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p) /** * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector. */ -ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a) +ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint4 a, uint8_t* p) { #if ASTCENC_SSE >= 41 __m128i shuf = _mm_set_epi8(0,0,0,0, 0,0,0,0, 0,0,0,0, 12,8,4,0); - return vint4(_mm_shuffle_epi8(a.m, shuf)); + a = vint4(_mm_shuffle_epi8(a.m, shuf)); + store_nbytes(a, p); #else __m128i va = _mm_unpacklo_epi8(a.m, _mm_shuffle_epi32(a.m, _MM_SHUFFLE(1,1,1,1))); __m128i vb = _mm_unpackhi_epi8(a.m, _mm_shuffle_epi32(a.m, _MM_SHUFFLE(3,3,3,3))); - return vint4(_mm_unpacklo_epi16(va, vb)); + a = vint4(_mm_unpacklo_epi16(va, vb)); + store_nbytes(a, p); #endif } @@ -877,19 +879,6 @@ ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond) #endif } -/** - * @brief Return lanes from @c b if MSB of @c cond is set, else @c a. 
- */ -ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond) -{ -#if ASTCENC_SSE >= 41 - return vfloat4(_mm_blendv_ps(a.m, b.m, cond.m)); -#else - __m128 d = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(cond.m), 31)); - return vfloat4(_mm_or_ps(_mm_and_ps(d, b.m), _mm_andnot_ps(d, a.m))); -#endif -} - /** * @brief Load a vector of gathered results from an array; */ diff --git a/Source/astcenc_vecmathlib_sve_8.h b/Source/astcenc_vecmathlib_sve_8.h new file mode 100644 index 000000000..85c91bffe --- /dev/null +++ b/Source/astcenc_vecmathlib_sve_8.h @@ -0,0 +1,1089 @@ +// SPDX-License-Identifier: Apache-2.0 +// ---------------------------------------------------------------------------- +// Copyright 2019-2024 Arm Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// ---------------------------------------------------------------------------- + +/** + * @brief 8x32-bit vectors, implemented using SVE. + * + * This module implements 8-wide 32-bit float, int, and mask vectors for Arm + * SVE. + * + * There is a baseline level of functionality provided by all vector widths and + * implementations. This is implemented using identical function signatures, + * modulo data type, so we can use them as substitutable implementations in VLA + * code. + */ + +#ifndef ASTC_VECMATHLIB_sve_8_H_INCLUDED +#define ASTC_VECMATHLIB_sve_8_H_INCLUDED + +#ifndef ASTCENC_SIMD_INLINE + #error "Include astcenc_vecmathlib.h, do not include directly" +#endif + +#include + +typedef svbool_t svbool_8_t __attribute__((arm_sve_vector_bits(256))); +typedef svuint8_t svuint8_8_t __attribute__((arm_sve_vector_bits(256))); +typedef svuint16_t svuint16_8_t __attribute__((arm_sve_vector_bits(256))); +typedef svuint32_t svuint32_8_t __attribute__((arm_sve_vector_bits(256))); +typedef svint32_t svint32_8_t __attribute__((arm_sve_vector_bits(256))); +typedef svfloat32_t svfloat32_8_t __attribute__((arm_sve_vector_bits(256))); + +// ============================================================================ +// vfloat8 data type +// ============================================================================ + +/** + * @brief Data type for 8-wide floats. + */ +struct vfloat8 +{ + /** + * @brief Construct from zero-initialized value. + */ + ASTCENC_SIMD_INLINE vfloat8() = default; + + /** + * @brief Construct from 8 values loaded from an unaligned address. + * + * Consider using loada() which is better with vectors if data is aligned + * to vector length. + */ + ASTCENC_SIMD_INLINE explicit vfloat8(const float *p) + { + m = svld1_f32(svptrue_b32(), p); + } + + /** + * @brief Construct from 1 scalar value replicated across all lanes. + * + * Consider using zero() for constexpr zeros. + */ + ASTCENC_SIMD_INLINE explicit vfloat8(float a) + { + m = svdup_f32(a); + } + + /** + * @brief Construct from an existing SIMD register. + */ + ASTCENC_SIMD_INLINE explicit vfloat8(svfloat32_8_t a) + { + m = a; + } + + /** + * @brief Factory that returns a vector of zeros. 
+ */
+ static ASTCENC_SIMD_INLINE vfloat8 zero()
+ {
+ return vfloat8(0.0f);
+ }
+
+ /**
+ * @brief Factory that returns a replicated scalar loaded from memory.
+ */
+ static ASTCENC_SIMD_INLINE vfloat8 load1(const float* p)
+ {
+ return vfloat8(*p);
+ }
+
+ /**
+ * @brief Factory that returns a vector loaded from 32B aligned memory.
+ */
+ static ASTCENC_SIMD_INLINE vfloat8 loada(const float* p)
+ {
+ return vfloat8(p);
+ }
+
+ /**
+ * @brief The vector ...
+ */
+ svfloat32_8_t m;
+};
+
+// ============================================================================
+// vint8 data type
+// ============================================================================
+
+/**
+ * @brief Data type for 8-wide ints.
+ */
+struct vint8
+{
+ /**
+ * @brief Construct from zero-initialized value.
+ */
+ ASTCENC_SIMD_INLINE vint8() = default;
+
+ /**
+ * @brief Construct from 8 values loaded from an unaligned address.
+ *
+ * Consider using loada() which is better with vectors if data is aligned
+ * to vector length.
+ */
+ ASTCENC_SIMD_INLINE explicit vint8(const int *p)
+ {
+ m = svld1_s32(svptrue_b32(), p);
+ }
+
+ /**
+ * @brief Construct from 8 uint8_t loaded from an unaligned address.
+ */
+ ASTCENC_SIMD_INLINE explicit vint8(const uint8_t *p)
+ {
+ // Load 8 byte values
+ svbool_8_t pred = svptrue_pat_b8(SV_VL8);
+ svuint8_8_t m8 = svld1_u8(pred, p);
+
+ // Expand to 32-bits
+ svuint16_8_t m16 = svunpklo_u16(m8);
+ svuint32_8_t m32 = svunpklo_u32(m16);
+ m = svreinterpret_s32_u32(m32);
+ }
+
+ /**
+ * @brief Construct from 1 scalar value replicated across all lanes.
+ *
+ * Consider using vint8::zero() for constexpr zeros.
+ */
+ ASTCENC_SIMD_INLINE explicit vint8(int a)
+ {
+ m = svdup_s32(a);
+ }
+
+ /**
+ * @brief Construct from an existing SIMD register.
+ */
+ ASTCENC_SIMD_INLINE explicit vint8(svint32_8_t a)
+ {
+ m = a;
+ }
+
+ /**
+ * @brief Factory that returns a vector of zeros.
+ */
+ static ASTCENC_SIMD_INLINE vint8 zero()
+ {
+ return vint8(0);
+ }
+
+ /**
+ * @brief Factory that returns a replicated scalar loaded from memory.
+ */
+ static ASTCENC_SIMD_INLINE vint8 load1(const int* p)
+ {
+ return vint8(*p);
+ }
+
+ /**
+ * @brief Factory that returns a vector loaded from unaligned memory.
+ */
+ static ASTCENC_SIMD_INLINE vint8 load(const uint8_t* p)
+ {
+ svuint8_8_t data = svld1_u8(svptrue_b8(), p);
+ return vint8(svreinterpret_s32_u8(data));
+ }
+
+ /**
+ * @brief Factory that returns a vector loaded from 32B aligned memory.
+ */
+ static ASTCENC_SIMD_INLINE vint8 loada(const int* p)
+ {
+ return vint8(p);
+ }
+
+ /**
+ * @brief Factory that returns a vector containing the lane IDs.
+ */
+ static ASTCENC_SIMD_INLINE vint8 lane_id()
+ {
+ return vint8(svindex_s32(0, 1));
+ }
+
+ /**
+ * @brief The vector ...
+ */
+ svbool_8_t m;
+};
+
+// ============================================================================
+// vmask8 operators and functions
+// ============================================================================
+
+/**
+ * @brief Overload: mask union (or).
+ */
+ASTCENC_SIMD_INLINE vmask8 operator|(vmask8 a, vmask8 b)
+{
+ return vmask8(svorr_z(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: mask intersect (and).
+ */
+ASTCENC_SIMD_INLINE vmask8 operator&(vmask8 a, vmask8 b)
+{
+ return vmask8(svand_z(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: mask difference (xor).
+ */
+ASTCENC_SIMD_INLINE vmask8 operator^(vmask8 a, vmask8 b)
+{
+ return vmask8(sveor_z(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: mask invert (not).
+ */
+ASTCENC_SIMD_INLINE vmask8 operator~(vmask8 a)
+{
+ return vmask8(svnot_z(svptrue_b32(), a.m));
+}
+
+/**
+ * @brief Return an 8-bit mask code indicating mask status.
+ *
+ * bit0 = lane 0
+ */
+ASTCENC_SIMD_INLINE unsigned int mask(vmask8 a)
+{
+ alignas(32) const int shifta[8] { 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 };
+ svint32_8_t template_vals = svld1_s32(svptrue_b32(), shifta);
+ svint32_8_t active_vals = svsel_s32(a.m, template_vals, svdup_s32(0));
+ return static_cast<unsigned int>(svaddv_s32(svptrue_b32(), active_vals));
+}
+
+/**
+ * @brief True if any lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool any(vmask8 a)
+{
+ return svptest_any(svptrue_b32(), a.m);
+}
+
+/**
+ * @brief True if all lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool all(vmask8 a)
+{
+ return !svptest_any(svptrue_b32(), (~a).m);
+}
+
+// ============================================================================
+// vint8 operators and functions
+// ============================================================================
+/**
+ * @brief Overload: vector by vector addition.
+ */
+ASTCENC_SIMD_INLINE vint8 operator+(vint8 a, vint8 b)
+{
+ return vint8(svadd_s32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector incremental addition.
+ */
+ASTCENC_SIMD_INLINE vint8& operator+=(vint8& a, const vint8& b)
+{
+ a = a + b;
+ return a;
+}
+
+/**
+ * @brief Overload: vector by vector subtraction.
+ */
+ASTCENC_SIMD_INLINE vint8 operator-(vint8 a, vint8 b)
+{
+ return vint8(svsub_s32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector multiplication.
+ */
+ASTCENC_SIMD_INLINE vint8 operator*(vint8 a, vint8 b)
+{
+ return vint8(svmul_s32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector bit invert.
+ */
+ASTCENC_SIMD_INLINE vint8 operator~(vint8 a)
+{
+ return vint8(svnot_s32_x(svptrue_b32(), a.m));
+}
+
+/**
+ * @brief Overload: vector by vector bitwise or.
+ */
+ASTCENC_SIMD_INLINE vint8 operator|(vint8 a, vint8 b)
+{
+ return vint8(svorr_s32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector bitwise and.
+ */
+ASTCENC_SIMD_INLINE vint8 operator&(vint8 a, vint8 b)
+{
+ return vint8(svand_s32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector bitwise xor.
+ */
+ASTCENC_SIMD_INLINE vint8 operator^(vint8 a, vint8 b)
+{
+ return vint8(sveor_s32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector equality.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator==(vint8 a, vint8 b)
+{
+ return vmask8(svcmpeq_s32(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector inequality.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator!=(vint8 a, vint8 b)
+{
+ return vmask8(svcmpne_s32(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector less than.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator<(vint8 a, vint8 b)
+{
+ return vmask8(svcmplt_s32(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector greater than.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator>(vint8 a, vint8 b)
+{
+ return vmask8(svcmpgt_s32(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Logical shift left.
+ */
+template <int s> ASTCENC_SIMD_INLINE vint8 lsl(vint8 a)
+{
+ return vint8(svlsl_n_s32_x(svptrue_b32(), a.m, s));
+}
+
+/**
+ * @brief Arithmetic shift right.
+ */
+template <int s> ASTCENC_SIMD_INLINE vint8 asr(vint8 a)
+{
+ return vint8(svasr_n_s32_x(svptrue_b32(), a.m, s));
+}
+
+/**
+ * @brief Logical shift right.
+ */
+template <int s> ASTCENC_SIMD_INLINE vint8 lsr(vint8 a)
+{
+ svuint32_8_t r = svreinterpret_u32_s32(a.m);
+ r = svlsr_n_u32_x(svptrue_b32(), r, s);
+ return vint8(svreinterpret_s32_u32(r));
+}
+
+/**
+ * @brief Return the min vector of two vectors.
+ */
+ASTCENC_SIMD_INLINE vint8 min(vint8 a, vint8 b)
+{
+ return vint8(svmin_s32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Return the max vector of two vectors.
+ */
+ASTCENC_SIMD_INLINE vint8 max(vint8 a, vint8 b)
+{
+ return vint8(svmax_s32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Return the horizontal minimum of a vector.
+ */
+ASTCENC_SIMD_INLINE vint8 hmin(vint8 a)
+{
+ return vint8(svminv_s32(svptrue_b32(), a.m));
+}
+
+/**
+ * @brief Return the horizontal minimum of a vector.
+ */
+ASTCENC_SIMD_INLINE int hmin_s(vint8 a)
+{
+ return svminv_s32(svptrue_b32(), a.m);
+}
+
+/**
+ * @brief Return the horizontal maximum of a vector.
+ */
+ASTCENC_SIMD_INLINE vint8 hmax(vint8 a)
+{
+ return vint8(svmaxv_s32(svptrue_b32(), a.m));
+}
+
+/**
+ * @brief Return the horizontal maximum of a vector.
+ */
+ASTCENC_SIMD_INLINE int hmax_s(vint8 a)
+{
+ return svmaxv_s32(svptrue_b32(), a.m);
+}
+
+/**
+ * @brief Store a vector to a 16B aligned memory address.
+ */
+ASTCENC_SIMD_INLINE void storea(vint8 a, int* p)
+{
+ svst1_s32(svptrue_b32(), p, a.m);
+}
+
+/**
+ * @brief Store a vector to an unaligned memory address.
+ */
+ASTCENC_SIMD_INLINE void store(vint8 a, int* p)
+{
+ svst1_s32(svptrue_b32(), p, a.m);
+}
+
+/**
+ * @brief Store lowest N (vector width) bytes into an unaligned address.
+ */
+ASTCENC_SIMD_INLINE void store_nbytes(vint8 a, uint8_t* p)
+{
+ svuint8_8_t r = svreinterpret_u8_s32(a.m);
+ svst1_u8(svptrue_pat_b8(SV_VL8), p, r);
+}
+
+/**
+ * @brief Pack and store low 8 bits of each vector lane.
+ */
+ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint8 v, uint8_t* p)
+{
+ svuint32_8_t data = svreinterpret_u32_s32(v.m);
+ svst1b_u32(svptrue_b32(), p, data);
+}
+
+/**
+ * @brief Return lanes from @c b if @c cond is set, else @c a.
+ */
+ASTCENC_SIMD_INLINE vint8 select(vint8 a, vint8 b, vmask8 cond)
+{
+ return vint8(svsel_s32(cond.m, b.m, a.m));
+}
+
+// ============================================================================
+// vfloat8 operators and functions
+// ============================================================================
+
+/**
+ * @brief Overload: vector by vector addition.
+ */
+ASTCENC_SIMD_INLINE vfloat8 operator+(vfloat8 a, vfloat8 b)
+{
+ return vfloat8(svadd_f32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector incremental addition.
+ */ +ASTCENC_SIMD_INLINE vfloat8& operator+=(vfloat8& a, const vfloat8& b) +{ + a = a + b; + return a; +} + +/** + * @brief Overload: vector by vector subtraction. + */ +ASTCENC_SIMD_INLINE vfloat8 operator-(vfloat8 a, vfloat8 b) +{ + return vfloat8(svsub_f32_x(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Overload: vector by vector multiplication. + */ +ASTCENC_SIMD_INLINE vfloat8 operator*(vfloat8 a, vfloat8 b) +{ + return vfloat8(svmul_f32_x(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Overload: vector by scalar multiplication. + */ +ASTCENC_SIMD_INLINE vfloat8 operator*(vfloat8 a, float b) +{ + return vfloat8(svmul_f32_x(svptrue_b32(), a.m, svdup_f32(b))); +} + +/** + * @brief Overload: scalar by vector multiplication. + */ +ASTCENC_SIMD_INLINE vfloat8 operator*(float a, vfloat8 b) +{ + return vfloat8(svmul_f32_x(svptrue_b32(), svdup_f32(a), b.m)); +} + +/** + * @brief Overload: vector by vector division. + */ +ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, vfloat8 b) +{ + return vfloat8(svdiv_f32_x(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Overload: vector by scalar division. + */ +ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, float b) +{ + return vfloat8(svdiv_f32_x(svptrue_b32(), a.m, svdup_f32(b))); +} + + +/** + * @brief Overload: scalar by vector division. + */ +ASTCENC_SIMD_INLINE vfloat8 operator/(float a, vfloat8 b) +{ + return vfloat8(svdiv_f32_x(svptrue_b32(), svdup_f32(a), b.m)); +} + + +/** + * @brief Overload: vector by vector equality. + */ +ASTCENC_SIMD_INLINE vmask8 operator==(vfloat8 a, vfloat8 b) +{ + return vmask8(svcmpeq_f32(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Overload: vector by vector inequality. + */ +ASTCENC_SIMD_INLINE vmask8 operator!=(vfloat8 a, vfloat8 b) +{ + return vmask8(svcmpne_f32(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Overload: vector by vector less than. + */ +ASTCENC_SIMD_INLINE vmask8 operator<(vfloat8 a, vfloat8 b) +{ + return vmask8(svcmplt_f32(svptrue_b32(), a.m, b.m));; +} + +/** + * @brief Overload: vector by vector greater than. + */ +ASTCENC_SIMD_INLINE vmask8 operator>(vfloat8 a, vfloat8 b) +{ + return vmask8(svcmpgt_f32(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Overload: vector by vector less than or equal. + */ +ASTCENC_SIMD_INLINE vmask8 operator<=(vfloat8 a, vfloat8 b) +{ + return vmask8(svcmple_f32(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Overload: vector by vector greater than or equal. + */ +ASTCENC_SIMD_INLINE vmask8 operator>=(vfloat8 a, vfloat8 b) +{ + return vmask8(svcmpge_f32(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Return the min vector of two vectors. + * + * If either lane value is NaN, the other lane will be returned. + */ +ASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, vfloat8 b) +{ + return vfloat8(svminnm_f32_x(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Return the min vector of a vector and a scalar. + * + * If either lane value is NaN, the other lane will be returned. + */ +ASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, float b) +{ + return min(a, vfloat8(b)); +} + +/** + * @brief Return the max vector of two vectors. + * + * If either lane value is NaN, the other lane will be returned. + */ +ASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, vfloat8 b) +{ + return vfloat8(svmaxnm_f32_x(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Return the max vector of a vector and a scalar. + * + * If either lane value is NaN, the other lane will be returned. 
+ */ +ASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, float b) +{ + return max(a, vfloat8(b)); +} + +/** + * @brief Return the clamped value between min and max. + * + * It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN + * then @c min will be returned for that lane. + */ +ASTCENC_SIMD_INLINE vfloat8 clamp(float minv, float maxv, vfloat8 a) +{ + return min(max(a, minv), maxv); +} + +/** + * @brief Return a clamped value between 0.0f and 1.0f. + * + * If @c a is NaN then zero will be returned for that lane. + */ +ASTCENC_SIMD_INLINE vfloat8 clampzo(vfloat8 a) +{ + return clamp(0.0f, 1.0f, a); +} + +/** + * @brief Return the absolute value of the float vector. + */ +ASTCENC_SIMD_INLINE vfloat8 abs(vfloat8 a) +{ + return vfloat8(svabs_f32_x(svptrue_b32(), a.m)); +} + +/** + * @brief Return a float rounded to the nearest integer value. + */ +ASTCENC_SIMD_INLINE vfloat8 round(vfloat8 a) +{ + return vfloat8(svrintn_f32_x(svptrue_b32(), a.m)); +} + +/** + * @brief Return the horizontal minimum of a vector. + */ +ASTCENC_SIMD_INLINE vfloat8 hmin(vfloat8 a) +{ + return vfloat8(svminnmv_f32(svptrue_b32(), a.m)); +} + +/** + * @brief Return the horizontal minimum of a vector. + */ +ASTCENC_SIMD_INLINE float hmin_s(vfloat8 a) +{ + return svminnmv_f32(svptrue_b32(), a.m); +} + +/** + * @brief Return the horizontal maximum of a vector. + */ +ASTCENC_SIMD_INLINE vfloat8 hmax(vfloat8 a) +{ + return vfloat8(svmaxnmv_f32(svptrue_b32(), a.m)); +} + +/** + * @brief Return the horizontal maximum of a vector. + */ +ASTCENC_SIMD_INLINE float hmax_s(vfloat8 a) +{ + return svmaxnmv_f32(svptrue_b32(), a.m); +} + +/** + * @brief Return the horizontal sum of a vector. + */ +ASTCENC_SIMD_INLINE float hadd_s(vfloat8 a) +{ + // Can't use svaddv - it's not invariant + vfloat4 lo(svget_neonq_f32(a.m)); + vfloat4 hi(svget_neonq_f32(svext_f32(a.m, a.m, 4))); + return hadd_s(lo) + hadd_s(hi); +} + +/** + * @brief Return lanes from @c b if @c cond is set, else @c a. + */ +ASTCENC_SIMD_INLINE vfloat8 select(vfloat8 a, vfloat8 b, vmask8 cond) +{ + return vfloat8(svsel_f32(cond.m, b.m, a.m)); +} + +/** + * @brief Accumulate lane-wise sums for a vector, folded 4-wide. + * + * This is invariant with 4-wide implementations. + */ +ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a) +{ + vfloat4 lo(svget_neonq_f32(a.m)); + haccumulate(accum, lo); + + vfloat4 hi(svget_neonq_f32(svext_f32(a.m, a.m, 4))); + haccumulate(accum, hi); +} + +/** + * @brief Accumulate lane-wise sums for a vector. + * + * This is NOT invariant with 4-wide implementations. + */ +ASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a) +{ + accum += a; +} + +/** + * @brief Accumulate masked lane-wise sums for a vector, folded 4-wide. + * + * This is invariant with 4-wide implementations. + */ +ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a, vmask8 m) +{ + a = select(vfloat8::zero(), a, m); + haccumulate(accum, a); +} + +/** + * @brief Accumulate masked lane-wise sums for a vector. + * + * This is NOT invariant with 4-wide implementations. + */ +ASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a, vmask8 m) +{ + accum.m = svadd_f32_m(m.m, accum.m, a.m); +} + +/** + * @brief Return the sqrt of the lanes in the vector. 
+ */ +ASTCENC_SIMD_INLINE vfloat8 sqrt(vfloat8 a) +{ + return vfloat8(svsqrt_f32_x(svptrue_b32(), a.m)); +} + +/** + * @brief Load a vector of gathered results from an array; + */ +ASTCENC_SIMD_INLINE vfloat8 gatherf(const float* base, vint8 indices) +{ + return vfloat8(svld1_gather_s32index_f32(svptrue_b32(), base, indices.m)); +} + +/** + * @brief Store a vector to an unaligned memory address. + */ +ASTCENC_SIMD_INLINE void store(vfloat8 a, float* p) +{ + svst1_f32(svptrue_b32(), p, a.m); +} + +/** + * @brief Store a vector to a 32B aligned memory address. + */ +ASTCENC_SIMD_INLINE void storea(vfloat8 a, float* p) +{ + svst1_f32(svptrue_b32(), p, a.m); +} + +/** + * @brief Return a integer value for a float vector, using truncation. + */ +ASTCENC_SIMD_INLINE vint8 float_to_int(vfloat8 a) +{ + return vint8(svcvt_s32_f32_x(svptrue_b32(), a.m)); +} + +/** + * @brief Return a integer value for a float vector, using round-to-nearest. + */ +ASTCENC_SIMD_INLINE vint8 float_to_int_rtn(vfloat8 a) +{ + a = a + vfloat8(0.5f); + return vint8(svcvt_s32_f32_x(svptrue_b32(), a.m)); +} + +/** + * @brief Return a float value for an integer vector. + */ +ASTCENC_SIMD_INLINE vfloat8 int_to_float(vint8 a) +{ + return vfloat8(svcvt_f32_s32_x(svptrue_b32(), a.m)); +} + +/** + * @brief Return a float value as an integer bit pattern (i.e. no conversion). + * + * It is a common trick to convert floats into integer bit patterns, perform + * some bit hackery based on knowledge they are IEEE 754 layout, and then + * convert them back again. This is the first half of that flip. + */ +ASTCENC_SIMD_INLINE vint8 float_as_int(vfloat8 a) +{ + return vint8(svreinterpret_s32_f32(a.m)); +} + +/** + * @brief Return a integer value as a float bit pattern (i.e. no conversion). + * + * It is a common trick to convert floats into integer bit patterns, perform + * some bit hackery based on knowledge they are IEEE 754 layout, and then + * convert them back again. This is the second half of that flip. + */ +ASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a) +{ + return vfloat8(svreinterpret_f32_s32(a.m)); +} + +/** + * @brief Prepare a vtable lookup table for use with the native SIMD size. + */ +ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint8& t0p) +{ + t0p = vint8(svdup_neonq_f32(t0.m)); +} + +/** + * @brief Prepare a vtable lookup table for use with the native SIMD size. + */ +ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint8& t0p, vint8& t1p) +{ + // 8-wide SVE uses a single table register, so t1 is unused + (void)t1p; + + svfloat32_8_t t0v = svdup_neonq_f32(t0.m); + svfloat32_8_t t1v = svdup_neonq_f32(t1.m); + + t0p = vint8(svext_f32(t0v, t1v, 4)); +} + +/** + * @brief Prepare a vtable lookup table for use with the native SIMD size. + */ +ASTCENC_SIMD_INLINE void vtable_prepare( + vint4 t0, vint4 t1, vint4 t2, vint4 t3, + vint8& t0p, vint8& t1p, vint8& t2p, vint8& t3p) +{ + // 8-wide SVE uses a two table registers, so t2 and t3 are unused + (void)t2p; + (void)t3p; + + svfloat32_8_t t0v = svdup_neonq_f32(t0.m); + svfloat32_8_t t1v = svdup_neonq_f32(t1.m); + svfloat32_8_t t2v = svdup_neonq_f32(t2.m); + svfloat32_8_t t3v = svdup_neonq_f32(t3.m); + + t0p = vint8(svext_f32(t0v, t1v, 4)); + t1p = vint8(svext_f32(t2v, t3v, 4)); +} + +/** + * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes. 
+ */
+ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 idx)
+{
+ // Set index byte above max index for unused bytes so table lookup returns zero
+ svint32_8_t idx_masked = svorr_s32_x(svptrue_b32(), idx.m, svdup_s32(0xFFFFFF00));
+
+ svuint8_8_t idx_bytes = svreinterpret_u8_s32(idx_masked);
+ svuint8_8_t tbl_bytes = svreinterpret_u8_s32(t0.m);
+ svuint8_8_t result = svtbl_u8(tbl_bytes, idx_bytes);
+
+ return vint8(svreinterpret_s32_u8(result));
+}
+
+/**
+ * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
+ */
+ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 idx)
+{
+ // 8-wide SVE uses a single table register, so t1 is unused
+ (void)t1;
+
+ // Set index byte above max index for unused bytes so table lookup returns zero
+ svint32_8_t idx_masked = svorr_s32_x(svptrue_b32(), idx.m, svdup_s32(0xFFFFFF00));
+
+ svuint8_8_t idx_bytes = svreinterpret_u8_s32(idx_masked);
+ svuint8_8_t tbl_bytes = svreinterpret_u8_s32(t0.m);
+ svuint8_8_t result = svtbl_u8(tbl_bytes, idx_bytes);
+
+ return vint8(svreinterpret_s32_u8(result));
+}
+
+/**
+ * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
+ */
+ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3, vint8 idx)
+{
+ // 8-wide SVE uses two table registers, so t2 and t3 are unused
+ (void)t2;
+ (void)t3;
+
+ // Set index byte above max index for unused bytes so table lookup returns zero
+ svint32_8_t literal32 = svdup_s32(32);
+ svbool_8_t idx_lo_select = svcmplt(svptrue_b32(), idx.m, literal32);
+ svint32_8_t idx_lo_masked = svorr_s32_x(svptrue_b32(), idx.m, svdup_s32(0xFFFFFF00));
+ svint32_8_t idx_hi_masked = svorr_s32_x(svptrue_b32(), idx.m - literal32, svdup_s32(0xFFFFFF00));
+
+ svuint8_8_t idx_lo_bytes = svreinterpret_u8_s32(idx_lo_masked);
+ svuint8_8_t idx_hi_bytes = svreinterpret_u8_s32(idx_hi_masked);
+
+ svuint8_8_t tbl0_bytes = svreinterpret_u8_s32(t0.m);
+ svuint8_8_t tbl1_bytes = svreinterpret_u8_s32(t1.m);
+
+ svint32_8_t t0_lookup = svreinterpret_s32_u8(svtbl_u8(tbl0_bytes, idx_lo_bytes));
+ svint32_8_t t1_lookup = svreinterpret_s32_u8(svtbl_u8(tbl1_bytes, idx_hi_bytes));
+
+ svint32_8_t result = svsel_s32(idx_lo_select, t0_lookup, t1_lookup);
+
+ // Future: SVE2 can directly do svtbl2_u8() for a two register table
+ return vint8(result);
+}
+
+/**
+ * @brief Return a vector of interleaved RGBA data.
+ *
+ * Input vectors have the value stored in the bottom 8 bits of each lane,
+ * with high bits set to zero.
+ *
+ * Output vector stores a single RGBA texel packed in each lane.
+ */
+ASTCENC_SIMD_INLINE vint8 interleave_rgba8(vint8 r, vint8 g, vint8 b, vint8 a)
+{
+ return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
+}
+
+/**
+ * @brief Store a vector, skipping masked lanes.
+ *
+ * All masked lanes must be at the end of vector, after all non-masked lanes.
+ */
+ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint8 data, vmask8 mask)
+{
+ svst1_u32(mask.m, reinterpret_cast<uint32_t*>(base), data.m);
+}
+
+/**
+ * @brief Debug function to print a vector of ints.
+ */
+ASTCENC_SIMD_INLINE void print(vint8 a)
+{
+ alignas(32) int v[8];
+ storea(a, v);
+ printf("v8_i32:\n %8d %8d %8d %8d %8d %8d %8d %8d\n",
+ v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
+}
+
+/**
+ * @brief Debug function to print a vector of ints.
+ */
+ASTCENC_SIMD_INLINE void printx(vint8 a)
+{
+ alignas(32) int v[8];
+ storea(a, v);
+ printf("v8_i32:\n %08x %08x %08x %08x %08x %08x %08x %08x\n",
+ v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
+}
+
+/**
+ * @brief Debug function to print a vector of floats.
+ */
+ASTCENC_SIMD_INLINE void print(vfloat8 a)
+{
+ alignas(32) float v[8];
+ storea(a, v);
+ printf("v8_f32:\n %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f\n",
+ static_cast<double>(v[0]), static_cast<double>(v[1]),
+ static_cast<double>(v[2]), static_cast<double>(v[3]),
+ static_cast<double>(v[4]), static_cast<double>(v[5]),
+ static_cast<double>(v[6]), static_cast<double>(v[7]));
+}
+
+/**
+ * @brief Debug function to print a vector of masks.
+ */
+ASTCENC_SIMD_INLINE void print(vmask8 a)
+{
+ print(select(vint8(0), vint8(1), a));
+}
+
+#endif // #ifndef ASTC_VECMATHLIB_sve_8_H_INCLUDED
diff --git a/Source/astcenccli_entry.cpp b/Source/astcenccli_entry.cpp
index 3c56fc225..ab0b95923 100644
--- a/Source/astcenccli_entry.cpp
+++ b/Source/astcenccli_entry.cpp
@@ -37,6 +37,7 @@ int astcenc_main( int argc, char **argv);
+// x86-64 builds
 #if (ASTCENC_SSE > 20) || (ASTCENC_AVX > 0) || \
 (ASTCENC_POPCNT > 0) || (ASTCENC_F16C > 0)
@@ -254,6 +255,36 @@
 return true;
 }
+// Validate Arm SVE availability
+#elif ASTCENC_SVE != 0
+
+#include <sys/auxv.h>
+static bool cpu_supports_sve_256()
+{
+ long hwcaps = getauxval(AT_HWCAP);
+ return (hwcaps & HWCAP_SVE) != 0;
+}
+
+/**
+ * @brief Print a string to stderr.
+ */
+static inline void print_error(
+ const char* format
+) {
+ fprintf(stderr, "%s", format);
+}
+
+static bool validate_cpu_isa()
+{
+ if (!cpu_supports_sve_256())
+ {
+ print_error("ERROR: Host does not support SVE ISA extension\n");
+ return false;
+ }
+
+ return true;
+}
+
 #else
 // Fallback for cases with no dynamic ISA availability
diff --git a/Source/astcenccli_image.cpp b/Source/astcenccli_image.cpp
index 4b1bb637a..237da60c7 100644
--- a/Source/astcenccli_image.cpp
+++ b/Source/astcenccli_image.cpp
@@ -350,8 +350,7 @@ uint8_t* unorm8x4_array_from_astc_img(
 color = clamp(0.0f, 1.0f, color) * 255.0f;
 colori = float_to_int_rtn(color);
- pack_low_bytes(colori);
- store_nbytes(colori, dst + 4 * x);
+ pack_and_store_low_bytes(colori, dst + 4 * x);
 }
 }
 }
diff --git a/Source/astcenccli_toplevel.cpp b/Source/astcenccli_toplevel.cpp
index 39eb586af..dd0941b65 100644
--- a/Source/astcenccli_toplevel.cpp
+++ b/Source/astcenccli_toplevel.cpp
@@ -1382,8 +1382,7 @@ static void image_set_pixel_u8(
 assert(img.data_type == ASTCENC_TYPE_U8);
 uint8_t* data = static_cast<uint8_t*>(img.data[0]);
- pixel = pack_low_bytes(pixel);
- store_nbytes(pixel, data + (4 * img.dim_x * y) + (4 * x ));
+ pack_and_store_low_bytes(pixel, data + (4 * img.dim_x * y) + (4 * x));
 }
 /**
@@ -1886,6 +1885,18 @@ int astcenc_main( int argc, char **argv ) {
+#if ASTCENC_SVE != 0
+ // Do this check here because it needs SVE instructions so cannot be in
+ // the veneer check which is compiled as stock Armv8. We know we have SVE
+ // by the time we get this far, but not the vector width.
+ if (svcntw() != ASTCENC_SVE)
+ {
+ uint32_t bits = ASTCENC_SVE * 32;
+ print_error("ERROR: Host does not implement %u bit SVE ISA extension\n", bits);
+ return 1;
+ }
+#endif
+
 double start_time = get_time();
 if (argc < 2)
diff --git a/Source/astcenccli_toplevel_help.cpp b/Source/astcenccli_toplevel_help.cpp
index 71b9a42d0..b01da6bd5 100644
--- a/Source/astcenccli_toplevel_help.cpp
+++ b/Source/astcenccli_toplevel_help.cpp
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2023 Arm Limited
+// Copyright 2011-2024 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -560,6 +560,8 @@ void astcenc_print_header()
 const char* simdtype = "sse4.1";
 #elif (ASTCENC_SSE == 20)
 const char* simdtype = "sse2";
+#elif (ASTCENC_SVE == 8)
+ const char* simdtype = "sve.256b";
 #elif (ASTCENC_NEON == 1)
 const char* simdtype = "neon";
 #else
diff --git a/Source/cmake_core.cmake b/Source/cmake_core.cmake
index e78eb70bb..172b89502 100644
--- a/Source/cmake_core.cmake
+++ b/Source/cmake_core.cmake
@@ -271,6 +271,7 @@ macro(astcenc_set_properties ASTCENC_TARGET_NAME ASTCENC_IS_VENEER)
 target_compile_definitions(${ASTCENC_TARGET_NAME}
 PRIVATE
 ASTCENC_NEON=0
+ ASTCENC_SVE=0
 ASTCENC_SSE=0
 ASTCENC_AVX=0
 ASTCENC_POPCNT=0
@@ -280,6 +281,7 @@ macro(astcenc_set_properties ASTCENC_TARGET_NAME ASTCENC_IS_VENEER)
 target_compile_definitions(${ASTCENC_TARGET_NAME}
 PRIVATE
 ASTCENC_NEON=1
+ ASTCENC_SVE=0
 ASTCENC_SSE=0
 ASTCENC_AVX=0
 ASTCENC_POPCNT=0
@@ -293,10 +295,28 @@ macro(astcenc_set_properties ASTCENC_TARGET_NAME ASTCENC_IS_VENEER)
 $<${is_msvccl}:/d2ssa-cfg-sink->)
 endif()
+ elseif(${ASTCENC_ISA_SIMD} MATCHES "sve_256")
+ target_compile_definitions(${ASTCENC_TARGET_NAME}
+ PRIVATE
+ ASTCENC_NEON=1
+ ASTCENC_SVE=8
+ ASTCENC_SSE=0
+ ASTCENC_AVX=0
+ ASTCENC_POPCNT=0
+ ASTCENC_F16C=0)
+
+ # Enable SVE
+ if (NOT ${ASTCENC_IS_VENEER})
+ target_compile_options(${ASTCENC_TARGET_NAME}
+ PRIVATE
+ -march=armv8-a+sve -msve-vector-bits=256)
+ endif()
+
 elseif(${ASTCENC_ISA_SIMD} MATCHES "sse2")
 target_compile_definitions(${ASTCENC_TARGET_NAME}
 PRIVATE
 ASTCENC_NEON=0
+ ASTCENC_SVE=0
 ASTCENC_SSE=20
 ASTCENC_AVX=0
 ASTCENC_POPCNT=0
@@ -314,6 +334,7 @@ macro(astcenc_set_properties ASTCENC_TARGET_NAME ASTCENC_IS_VENEER)
 target_compile_definitions(${ASTCENC_TARGET_NAME}
 PRIVATE
 ASTCENC_NEON=0
+ ASTCENC_SVE=0
 ASTCENC_SSE=41
 ASTCENC_AVX=0
 ASTCENC_POPCNT=1
@@ -338,6 +359,7 @@ macro(astcenc_set_properties ASTCENC_TARGET_NAME ASTCENC_IS_VENEER)
 target_compile_definitions(${ASTCENC_TARGET_NAME}
 PRIVATE
 ASTCENC_NEON=0
+ ASTCENC_SVE=0
 ASTCENC_SSE=41
 ASTCENC_AVX=2
 ASTCENC_POPCNT=1
diff --git a/Test/astc_test_image.py b/Test/astc_test_image.py
index 141b7015c..df4e51115 100644
--- a/Test/astc_test_image.py
+++ b/Test/astc_test_image.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: Apache-2.0
 # -----------------------------------------------------------------------------
-# Copyright 2019-2023 Arm Limited
+# Copyright 2019-2024 Arm Limited
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not
 # use this file except in compliance with the License.
You may obtain a copy @@ -289,13 +289,12 @@ def parse_command_line(): refcoders = ["ref-1.7", "ref-2.5-neon", "ref-2.5-sse2", "ref-2.5-sse4.1", "ref-2.5-avx2", "ref-3.7-neon", "ref-3.7-sse2", "ref-3.7-sse4.1", "ref-3.7-avx2", - "ref-4.4-neon", "ref-4.4-sse2", "ref-4.4-sse4.1", "ref-4.4-avx2", "ref-4.5-neon", "ref-4.5-sse2", "ref-4.5-sse4.1", "ref-4.5-avx2", - "ref-main-neon", "ref-main-sse2", "ref-main-sse4.1", "ref-main-avx2"] + "ref-main-neon", "ref-main-sve_256", "ref-main-sse2", "ref-main-sse4.1", "ref-main-avx2"] # All test encoders - testcoders = ["none", "neon", "sse2", "sse4.1", "avx2", "native", "universal"] - testcodersAArch64 = ["neon"] + testcoders = ["none", "neon", "sve_256", "sse2", "sse4.1", "avx2", "native", "universal"] + testcodersAArch64 = ["neon", "sve_256"] testcodersX86 = ["sse2", "sse4.1", "avx2"] coders = refcoders + testcoders + ["all-aarch64", "all-x86"] diff --git a/Test/astc_update_ref.sh b/Test/astc_update_ref.sh index bf6c32006..827d089e8 100755 --- a/Test/astc_update_ref.sh +++ b/Test/astc_update_ref.sh @@ -17,6 +17,6 @@ echo "" TARGET_ROOT=${1} -python3 ./Test/astc_test_image.py --test-set all --block-size all --test-quality all --repeats 6 --encoder ref-$1-avx2 +python3 ./Test/astc_test_image.py --test-set all --block-size all --test-quality all --repeats 6 --encoder ref-$1-neon #python3 ./Test/astc_test_image.py --test-set all --block-size all --test-quality all --repeats 6 --encoder ref-$1-sse4.1 #python3 ./Test/astc_test_image.py --test-set all --block-size all --test-quality all --repeats 6 --encoder ref-$1-sse2
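Build-and-test sketch: to exercise the new backend locally on an AArch64 host whose SVE vector length is 256 bits, a minimal configuration could look like the commands below. The ASTCENC_ISA_SVE_256 and ASTCENC_UNITTEST options, the sve_256 encoder name, and the --test-set flag come from this patch and the existing test scripts; the build directory name, the Release build type, and the assumption of a GCC or Clang toolchain accepting -march=armv8-a+sve -msve-vector-bits=256 are illustrative, not part of the change.

    # Configure and build the fixed-width 256-bit SVE variant
    cmake -S . -B build_sve -DCMAKE_BUILD_TYPE=Release -DASTCENC_ISA_SVE_256=ON -DASTCENC_UNITTEST=ON
    cmake --build build_sve -j

    # Run the vector-math unit tests, then the image test sweep against the new artifact
    (cd build_sve && ctest)
    python3 ./Test/astc_test_image.py --encoder sve_256 --test-set all

Note that the resulting binaries refuse to run elsewhere: validate_cpu_isa() checks HWCAP_SVE at startup, and astcenc_main() additionally checks svcntw() against ASTCENC_SVE to confirm the 256-bit vector width.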