From e369e21c112a5b6153ba47ef20cb0fd0ad8db92d Mon Sep 17 00:00:00 2001
From: Peter Harris <peter.harris@arm.com>
Date: Sat, 23 Nov 2024 20:16:42 +0000
Subject: [PATCH 1/4] Prototype linear HDR error metrics.

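Prior to this change, the HDR code path computed color error, weight
error, and final encoding error by simply treating the LNS-encoded data
as linear data values.

This prototype decodes both the candidate color and the original texel
to linear floating-point values before computing the final encoding
error, and stops HDR profiles from taking the specialized 1-plane,
1-partition difference function.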
---
 Source/astcenc_compress_symbolic.cpp   |  3 ++-
 Source/astcenc_decompress_symbolic.cpp | 17 +++++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)
diff --git a/Source/astcenc_compress_symbolic.cpp b/Source/astcenc_compress_symbolic.cpp
index 98d249512..eb4fc9172 100644
--- a/Source/astcenc_compress_symbolic.cpp
+++ b/Source/astcenc_compress_symbolic.cpp
@@ -368,8 +368,9 @@ static float compress_symbolic_block_for_partition_1plane(
 
 	int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);
 
+	bool is_hdr = (config.profile == ASTCENC_PRF_HDR) || (config.profile == ASTCENC_PRF_HDR_RGB_LDR_A);
 	auto compute_difference = &compute_symbolic_block_difference_1plane;
-	if ((partition_count == 1) && !(config.flags & ASTCENC_FLG_MAP_RGBM))
+	if ((partition_count == 1) && !(config.flags & ASTCENC_FLG_MAP_RGBM) && !is_hdr)
 	{
 		compute_difference = &compute_symbolic_block_difference_1plane_1partition;
 	}
diff --git a/Source/astcenc_decompress_symbolic.cpp b/Source/astcenc_decompress_symbolic.cpp
index e7791eef6..583ff3144 100644
--- a/Source/astcenc_decompress_symbolic.cpp
+++ b/Source/astcenc_decompress_symbolic.cpp
@@ -354,6 +354,7 @@ float compute_symbolic_block_difference_2plane(
 	                       ep0, ep1);
 
 	vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
+	vmask4 lns_mask(rgb_lns, rgb_lns, rgb_lns, a_lns);
 
 	// Unpack and compute error for each texel in the partition
 	unsigned int texel_count = bsd.texel_count;
@@ -362,8 +363,15 @@ float compute_symbolic_block_difference_2plane(
 		vint4 weight = select(vint4(plane1_weights[i]), vint4(plane2_weights[i]), plane2_mask);
 		vint4 colori = lerp_color_int(u8_mask, ep0, ep1, weight);
 
+		#if 0
 		vfloat4 color = int_to_float(colori);
 		vfloat4 oldColor = blk.texel(i);
+		#else
+		// TODO: Hack to force linear HDR RGB image error analysis
+		vfloat4 color = decode_texel(colori, lns_mask);
+		vfloat4 oldColor = float16_to_float(lns_to_sf16(float_to_int(blk.texel(i))));
+		oldColor.set_lane<3>(1.0f);
+		#endif
 
 		// Compare error using a perceptual decode metric for RGBM textures
 		if (config.flags & ASTCENC_FLG_MAP_RGBM)
@@ -451,6 +459,8 @@ float compute_symbolic_block_difference_1plane(
 		                       rgb_lns, a_lns,
 		                       ep0, ep1);
 
+		vmask4 lns_mask(rgb_lns, rgb_lns, rgb_lns, a_lns);
+
 		// Unpack and compute error for each texel in the partition
 		unsigned int texel_count = pi.partition_texel_count[i];
 		for (unsigned int j = 0; j < texel_count; j++)
@@ -459,8 +469,15 @@ float compute_symbolic_block_difference_1plane(
 			vint4 colori = lerp_color_int(u8_mask, ep0, ep1,
 			                              vint4(plane1_weights[tix]));
 
+			#if 0
 			vfloat4 color = int_to_float(colori);
 			vfloat4 oldColor = blk.texel(tix);
+			#else
+			// TODO: Hack to force linear HDR RGB image error analysis
+			vfloat4 color = decode_texel(colori, lns_mask);
+			vfloat4 oldColor = float16_to_float(lns_to_sf16(float_to_int(blk.texel(tix))));
+			oldColor.set_lane<3>(1.0f);
+			#endif
 
 			// Compare error using a perceptual decode metric for RGBM textures
 			if (config.flags & ASTCENC_FLG_MAP_RGBM)

From 2e59e2a06be4b9532714644681f95fd40dbda907 Mon Sep 17 00:00:00 2001
From: Peter Harris <peter.harris@arm.com>
Date: Sat, 23 Nov 2024 22:58:29 +0000
Subject: [PATCH 2/4] Use relative sum of squares error for HDR textures

HDR texture values are stored logarithmically.

Using absolute sum of squares on the logarithmic values causes the
compressor to spend too much effort preserving imperceptible shifts in
dark channel values at the expense of bright values in the same block.
This performs poorly in blocks with sharp luminance changes (dark
texels) and in blocks with saturated color values (dark channels in
bright pixels).

Using absolute sum of squares on linearized HDR values avoids the
compressor fixating on dark values, but instead causes the compressor to
spend too much effort preserving bright values. This is because the
errors in the bright channels can be orders of magnitude bigger than
the errors in the dark channels, and dark values can end up quantizing
close to black.

Using relative sum of squares on the logarithmic values, as proposed by
Ryg in the blog post below, encourages the compressor to find a balance
of relative error across the whole block, favoring neither light nor
dark channels.

https://fgiesen.wordpress.com/2024/11/14/mrsse/
---
 Source/astcenc_compress_symbolic.cpp   |  4 +++
 Source/astcenc_decompress_symbolic.cpp | 46 ++++++++++++++------------
 2 files changed, 28 insertions(+), 22 deletions(-)

diff --git a/Source/astcenc_compress_symbolic.cpp b/Source/astcenc_compress_symbolic.cpp
index eb4fc9172..946caa275 100644
--- a/Source/astcenc_compress_symbolic.cpp
+++ b/Source/astcenc_compress_symbolic.cpp
@@ -633,6 +633,7 @@ static float compress_symbolic_block_for_partition_1plane(
 
 				if (errorval < best_errorval_in_scb)
 				{
+					trace_add_data("select", "1");
 					best_errorval_in_scb = errorval;
 					workscb.errorval = errorval;
 					scb = workscb;
@@ -681,6 +682,7 @@ static float compress_symbolic_block_for_partition_1plane(
 
 			if (errorval < best_errorval_in_scb)
 			{
+				trace_add_data("select", "1");
 				best_errorval_in_scb = errorval;
 				workscb.errorval = errorval;
 				scb = workscb;
@@ -967,6 +969,7 @@ static float compress_symbolic_block_for_partition_2planes(
 
 				if (errorval < best_errorval_in_scb)
 				{
+					trace_add_data("select", "1");
 					best_errorval_in_scb = errorval;
 					workscb.errorval = errorval;
 					scb = workscb;
@@ -1016,6 +1019,7 @@ static float compress_symbolic_block_for_partition_2planes(
 
 			if (errorval < best_errorval_in_scb)
 			{
+				trace_add_data("select", "1");
 				best_errorval_in_scb = errorval;
 				workscb.errorval = errorval;
 				scb = workscb;
diff --git a/Source/astcenc_decompress_symbolic.cpp b/Source/astcenc_decompress_symbolic.cpp
index 583ff3144..f7d739cf6 100644
--- a/Source/astcenc_decompress_symbolic.cpp
+++ b/Source/astcenc_decompress_symbolic.cpp
@@ -363,15 +363,8 @@ float compute_symbolic_block_difference_2plane(
 		vint4 weight = select(vint4(plane1_weights[i]), vint4(plane2_weights[i]), plane2_mask);
 		vint4 colori = lerp_color_int(u8_mask, ep0, ep1, weight);
 
-		#if 0
 		vfloat4 color = int_to_float(colori);
 		vfloat4 oldColor = blk.texel(i);
-		#else
-		// TODO: Hack to force linear HDR RGB image error analysis
-		vfloat4 color = decode_texel(colori, lns_mask);
-		vfloat4 oldColor = float16_to_float(lns_to_sf16(float_to_int(blk.texel(i))));
-		oldColor.set_lane<3>(1.0f);
-		#endif
 
 		// Compare error using a perceptual decode metric for RGBM textures
 		if (config.flags & ASTCENC_FLG_MAP_RGBM)
@@ -403,11 +396,19 @@ float compute_symbolic_block_difference_2plane(
 			);
 		}
 
-		vfloat4 error = oldColor - color;
-		error = min(abs(error), 1e15f);
-		error = error * error;
+		// Compute sum of squared errors, weighted by channel weight
+		vfloat4 error = (oldColor - color);
+		error = dot(error, error * blk.channel_weight);
 
-		summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT);
+		// Convert this relative sum of squared error for HDR to avoid light
+		// channels dominating the error calculations.
+		// See https://fgiesen.wordpress.com/2024/11/14/mrsse/
+		if (any(lns_mask))
+		{
+			error = error / (dot(oldColor, oldColor) + 1e-10f);
+		}
+
+		summa += min(error, ERROR_CALC_DEFAULT);
 	}
 
 	return summa.lane<0>();
@@ -469,15 +470,8 @@ float compute_symbolic_block_difference_1plane(
 			vint4 colori = lerp_color_int(u8_mask, ep0, ep1,
 			                              vint4(plane1_weights[tix]));
 
-			#if 0
 			vfloat4 color = int_to_float(colori);
 			vfloat4 oldColor = blk.texel(tix);
-			#else
-			// TODO: Hack to force linear HDR RGB image error analysis
-			vfloat4 color = decode_texel(colori, lns_mask);
-			vfloat4 oldColor = float16_to_float(lns_to_sf16(float_to_int(blk.texel(tix))));
-			oldColor.set_lane<3>(1.0f);
-			#endif
 
 			// Compare error using a perceptual decode metric for RGBM textures
 			if (config.flags & ASTCENC_FLG_MAP_RGBM)
@@ -509,11 +503,19 @@ float compute_symbolic_block_difference_1plane(
 				);
 			}
 
-			vfloat4 error = oldColor - color;
-			error = min(abs(error), 1e15f);
-			error = error * error;
+			// Compute sum of squared errors, weighted by channel weight
+			vfloat4 error = (oldColor - color);
+			error = dot(error, error * blk.channel_weight);
+
+			// Convert this relative sum of squared error for HDR to avoid light
+			// channels dominating the error calculations
+			// See https://fgiesen.wordpress.com/2024/11/14/mrsse/
+			if (any(lns_mask))
+			{
+				error = error / (dot(oldColor, oldColor) + 1e-10f);
+			}
 
-			summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT);
+			summa += min(error, ERROR_CALC_DEFAULT);
 		}
 	}
 

From 1a3d5580091911be88b7954424943370f43fb4dc Mon Sep 17 00:00:00 2001
From: Peter Harris <peter.harris@arm.com>
Date: Sat, 23 Nov 2024 23:25:38 +0000
Subject: [PATCH 3/4] Remove trace points

---
 Source/astcenc_compress_symbolic.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/Source/astcenc_compress_symbolic.cpp b/Source/astcenc_compress_symbolic.cpp
index 946caa275..eb4fc9172 100644
--- a/Source/astcenc_compress_symbolic.cpp
+++ b/Source/astcenc_compress_symbolic.cpp
@@ -633,7 +633,6 @@ static float compress_symbolic_block_for_partition_1plane(
 
 				if (errorval < best_errorval_in_scb)
 				{
-					trace_add_data("select", "1");
 					best_errorval_in_scb = errorval;
 					workscb.errorval = errorval;
 					scb = workscb;
@@ -682,7 +681,6 @@ static float compress_symbolic_block_for_partition_1plane(
 
 			if (errorval < best_errorval_in_scb)
 			{
-				trace_add_data("select", "1");
 				best_errorval_in_scb = errorval;
 				workscb.errorval = errorval;
 				scb = workscb;
@@ -969,7 +967,6 @@ static float compress_symbolic_block_for_partition_2planes(
 
 				if (errorval < best_errorval_in_scb)
 				{
-					trace_add_data("select", "1");
 					best_errorval_in_scb = errorval;
 					workscb.errorval = errorval;
 					scb = workscb;
@@ -1019,7 +1016,6 @@ static float compress_symbolic_block_for_partition_2planes(
 
 			if (errorval < best_errorval_in_scb)
 			{
-				trace_add_data("select", "1");
 				best_errorval_in_scb = errorval;
 				workscb.errorval = errorval;
 				scb = workscb;

From f959f7624ee18fc4b7dfd1e4df22f1bfbe15a978 Mon Sep 17 00:00:00 2001
From: Peter Harris <peter.harris@arm.com>
Date: Sat, 23 Nov 2024 23:28:41 +0000
Subject: [PATCH 4/4] Use scalar test not any(vmask)

---
 Source/astcenc_decompress_symbolic.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/Source/astcenc_decompress_symbolic.cpp b/Source/astcenc_decompress_symbolic.cpp
index f7d739cf6..91e6b444c 100644
--- a/Source/astcenc_decompress_symbolic.cpp
+++ b/Source/astcenc_decompress_symbolic.cpp
@@ -354,7 +354,7 @@ float compute_symbolic_block_difference_2plane(
 	                       ep0, ep1);
 
 	vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
-	vmask4 lns_mask(rgb_lns, rgb_lns, rgb_lns, a_lns);
+	bool any_lns = rgb_lns || a_lns;
 
 	// Unpack and compute error for each texel in the partition
 	unsigned int texel_count = bsd.texel_count;
@@ -403,8 +403,9 @@ float compute_symbolic_block_difference_2plane(
 		// Convert this relative sum of squared error for HDR to avoid light
 		// channels dominating the error calculations.
 		// See https://fgiesen.wordpress.com/2024/11/14/mrsse/
-		if (any(lns_mask))
+		if (any_lns)
 		{
+			// TODO: Divisor could be precomputed at load time
 			error = error / (dot(oldColor, oldColor) + 1e-10f);
 		}
 
@@ -460,7 +461,7 @@ float compute_symbolic_block_difference_1plane(
 		                       rgb_lns, a_lns,
 		                       ep0, ep1);
 
-		vmask4 lns_mask(rgb_lns, rgb_lns, rgb_lns, a_lns);
+		bool any_lns = rgb_lns || a_lns;
 
 		// Unpack and compute error for each texel in the partition
 		unsigned int texel_count = pi.partition_texel_count[i];
@@ -510,8 +511,9 @@ float compute_symbolic_block_difference_1plane(
 			// Convert this relative sum of squared error for HDR to avoid light
 			// channels dominating the error calculations
 			// See https://fgiesen.wordpress.com/2024/11/14/mrsse/
-			if (any(lns_mask))
+			if (any_lns)
 			{
+				// TODO: Divisor could be precomputed at load time
 				error = error / (dot(oldColor, oldColor) + 1e-10f);
 			}