diff --git a/api/src/main/java/org/apache/iceberg/expressions/Literals.java b/api/src/main/java/org/apache/iceberg/expressions/Literals.java index 3a45eb804f35..b5a1cbd2f917 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/Literals.java +++ b/api/src/main/java/org/apache/iceberg/expressions/Literals.java @@ -609,6 +609,9 @@ public String toString() { } static class UUIDLiteral extends ComparableLiteral { + private static final Comparator CMP = + Comparators.nullsFirst().thenComparing(Comparators.uuids()); + UUIDLiteral(UUID value) { super(value); } @@ -622,6 +625,11 @@ public Literal to(Type type) { return null; } + @Override + public Comparator comparator() { + return CMP; + } + @Override protected Type.TypeID typeId() { return Type.TypeID.UUID; diff --git a/api/src/main/java/org/apache/iceberg/types/Comparators.java b/api/src/main/java/org/apache/iceberg/types/Comparators.java index 32168d9a0904..eb5ec21361bc 100644 --- a/api/src/main/java/org/apache/iceberg/types/Comparators.java +++ b/api/src/main/java/org/apache/iceberg/types/Comparators.java @@ -22,6 +22,7 @@ import java.util.Comparator; import java.util.List; import java.util.Map; +import java.util.UUID; import java.util.function.IntFunction; import org.apache.iceberg.StructLike; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; @@ -46,7 +47,7 @@ private Comparators() {} .put(Types.TimestampNanoType.withZone(), Comparator.naturalOrder()) .put(Types.TimestampNanoType.withoutZone(), Comparator.naturalOrder()) .put(Types.StringType.get(), Comparators.charSequences()) - .put(Types.UUIDType.get(), Comparator.naturalOrder()) + .put(Types.UUIDType.get(), Comparators.uuids()) .put(Types.BinaryType.get(), Comparators.unsignedBytes()) .buildOrThrow(); @@ -232,6 +233,10 @@ public static Comparator filePath() { return FilePathComparator.INSTANCE; } + public static Comparator uuids() { + return UUIDComparator.INSTANCE; + } + private static class NullsFirst implements Comparator { private static final NullsFirst INSTANCE = new NullsFirst<>(); @@ -447,4 +452,40 @@ public int compare(CharSequence s1, CharSequence s2) { return 0; } } + + /** + * Compares UUIDs using unsigned byte-wise comparison using big-endian byte-order compliant with + * RFC 4122 and RFC 9562. Java's UUID.compareTo() compares most significant bits first, then least + * significant bits using signed value comparisons, which is a known bug. + */ + private static class UUIDComparator implements Comparator { + private static final UUIDComparator INSTANCE = new UUIDComparator(); + + private UUIDComparator() {} + + @Override + public int compare(UUID uuid1, UUID uuid2) { + if (uuid1 == uuid2) { + return 0; + } + + // Compare most significant bits first (bytes 0-7 in big-endian representation) + long msb1 = uuid1.getMostSignificantBits(); + long msb2 = uuid2.getMostSignificantBits(); + + // Use unsigned comparison for the most significant bits + int msbCompare = Long.compareUnsigned(msb1, msb2); + if (msbCompare != 0) { + return msbCompare; + } + + // If most significant bits are equal, compare least significant bits (bytes 8-15) + long lsb1 = uuid1.getLeastSignificantBits(); + long lsb2 = uuid2.getLeastSignificantBits(); + + // Use unsigned comparison for the least significant bits + return Long.compareUnsigned(lsb1, lsb2); + } + } } diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java index 068c862e2bda..341968a096b4 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java @@ -42,6 +42,7 @@ import static org.assertj.core.api.Assertions.assertThatThrownBy; import java.nio.ByteBuffer; +import java.util.UUID; import org.apache.iceberg.ManifestFile; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; @@ -66,7 +67,8 @@ public class TestInclusiveManifestEvaluator { optional(12, "no_nan_or_null", Types.DoubleType.get()), optional(13, "all_nulls_missing_nan_float", Types.FloatType.get()), optional(14, "all_same_value_or_null", Types.StringType.get()), - optional(15, "no_nulls_same_value_a", Types.StringType.get())); + optional(15, "no_nulls_same_value_a", Types.StringType.get()), + optional(16, "uuid", Types.UUIDType.get())); private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA) @@ -84,6 +86,7 @@ public class TestInclusiveManifestEvaluator { .identity("all_nulls_missing_nan_float") .identity("all_same_value_or_null") .identity("no_nulls_same_value_a") + .identity("uuid") .build(); private static final int INT_MIN_VALUE = 30; @@ -95,6 +98,18 @@ public class TestInclusiveManifestEvaluator { private static final ByteBuffer STRING_MIN = toByteBuffer(Types.StringType.get(), "a"); private static final ByteBuffer STRING_MAX = toByteBuffer(Types.StringType.get(), "z"); + // UUID_MIN has all zeros in MSB, all ones in LSB: 00000000-0000-0000-ffff-ffffffffffff + // UUID_MAX has all ones in MSB, all zeros in LSB: ffffffff-ffff-ffff-0000-000000000000 + // With unsigned byte-wise comparison (correct): UUID_MIN < UUID_MAX (0x00... < 0xFF...) + // With Java's natural order (incorrect): UUID_MIN > UUID_MAX (MSB 0 > MSB -1 as signed long) + private static final UUID UUID_MIN_VALUE = + UUID.fromString("00000000-0000-0000-ffff-ffffffffffff"); + private static final UUID UUID_MAX_VALUE = + UUID.fromString("ffffffff-ffff-ffff-0000-000000000000"); + + private static final ByteBuffer UUID_MIN = toByteBuffer(Types.UUIDType.get(), UUID_MIN_VALUE); + private static final ByteBuffer UUID_MAX = toByteBuffer(Types.UUIDType.get(), UUID_MAX_VALUE); + private static final ManifestFile NO_STATS = new TestHelpers.TestManifestFile( "manifest-list.avro", 1024, 0, System.currentTimeMillis(), null, null, null, null, null); @@ -128,7 +143,8 @@ public class TestInclusiveManifestEvaluator { toByteBuffer(Types.FloatType.get(), 20F)), new TestHelpers.TestFieldSummary(true, null, null), new TestHelpers.TestFieldSummary(true, STRING_MIN, STRING_MIN), - new TestHelpers.TestFieldSummary(false, STRING_MIN, STRING_MIN)), + new TestHelpers.TestFieldSummary(false, STRING_MIN, STRING_MIN), + new TestHelpers.TestFieldSummary(false, UUID_MIN, UUID_MAX)), null); @Test @@ -753,4 +769,167 @@ public void testIntegerNotIn() { ManifestEvaluator.forRowFilter(notIn("no_nulls", "abc", "def"), SPEC, true).eval(FILE); assertThat(shouldRead).as("Should read: notIn on no nulls column").isTrue(); } + + /** Tests UUID equality filter using byte-order comparison against partition bounds. */ + @Test + public void testUuidEq() { + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + boolean shouldRead = + ManifestEvaluator.forRowFilter(equal("uuid", belowMin), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should not read: uuid below lower bound").isFalse(); + + shouldRead = + ManifestEvaluator.forRowFilter(equal("uuid", UUID_MIN_VALUE), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: uuid equal to lower bound").isTrue(); + + UUID middle = UUID.fromString("7fffffff-ffff-ffff-7fff-ffffffffffff"); + shouldRead = ManifestEvaluator.forRowFilter(equal("uuid", middle), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: uuid between lower and upper bounds").isTrue(); + + shouldRead = + ManifestEvaluator.forRowFilter(equal("uuid", UUID_MAX_VALUE), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: uuid equal to upper bound").isTrue(); + + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = ManifestEvaluator.forRowFilter(equal("uuid", aboveMax), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should not read: uuid above upper bound").isFalse(); + } + + /** Tests UUID less-than filter using byte-order comparison against partition bounds. */ + @Test + public void testUuidLt() { + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + boolean shouldRead = + ManifestEvaluator.forRowFilter(lessThan("uuid", belowMin), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should not read: uuid range below lower bound").isFalse(); + + shouldRead = + ManifestEvaluator.forRowFilter(lessThan("uuid", UUID_MIN_VALUE), SPEC, true).eval(FILE); + assertThat(shouldRead) + .as("Should not read: uuid range below lower bound (UUID_MIN is not < UUID_MIN)") + .isFalse(); + + UUID justAboveMin = UUID.fromString("00000000-0000-0001-0000-000000000000"); + shouldRead = + ManifestEvaluator.forRowFilter(lessThan("uuid", justAboveMin), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: one possible uuid").isTrue(); + + shouldRead = + ManifestEvaluator.forRowFilter(lessThan("uuid", UUID_MAX_VALUE), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: uuid between lower and upper bounds").isTrue(); + + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = ManifestEvaluator.forRowFilter(lessThan("uuid", aboveMax), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: all uuids in range").isTrue(); + } + + /** Tests UUID less-than-or-equal filter using byte-order comparison against partition bounds. */ + @Test + public void testUuidLtEq() { + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + boolean shouldRead = + ManifestEvaluator.forRowFilter(lessThanOrEqual("uuid", belowMin), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should not read: uuid range below lower bound").isFalse(); + + shouldRead = + ManifestEvaluator.forRowFilter(lessThanOrEqual("uuid", UUID_MIN_VALUE), SPEC, true) + .eval(FILE); + assertThat(shouldRead).as("Should read: one possible uuid").isTrue(); + + shouldRead = + ManifestEvaluator.forRowFilter(lessThanOrEqual("uuid", UUID_MAX_VALUE), SPEC, true) + .eval(FILE); + assertThat(shouldRead).as("Should read: all uuids in range").isTrue(); + + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = + ManifestEvaluator.forRowFilter(lessThanOrEqual("uuid", aboveMax), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: all uuids in range").isTrue(); + } + + /** Tests UUID greater-than filter using byte-order comparison against partition bounds. */ + @Test + public void testUuidGt() { + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + boolean shouldRead = + ManifestEvaluator.forRowFilter(greaterThan("uuid", belowMin), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: all uuids in range").isTrue(); + + shouldRead = + ManifestEvaluator.forRowFilter(greaterThan("uuid", UUID_MIN_VALUE), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: uuid between lower and upper bounds").isTrue(); + + UUID justBelowMax = UUID.fromString("ffffffff-ffff-fffe-ffff-ffffffffffff"); + shouldRead = + ManifestEvaluator.forRowFilter(greaterThan("uuid", justBelowMax), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: one possible uuid").isTrue(); + + shouldRead = + ManifestEvaluator.forRowFilter(greaterThan("uuid", UUID_MAX_VALUE), SPEC, true).eval(FILE); + assertThat(shouldRead) + .as("Should not read: uuid range above upper bound (UUID_MAX is not > UUID_MAX)") + .isFalse(); + + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = + ManifestEvaluator.forRowFilter(greaterThan("uuid", aboveMax), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should not read: uuid range above upper bound").isFalse(); + } + + /** + * Tests UUID greater-than-or-equal filter using byte-order comparison against partition bounds. + */ + @Test + public void testUuidGtEq() { + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + boolean shouldRead = + ManifestEvaluator.forRowFilter(greaterThanOrEqual("uuid", belowMin), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: all uuids in range").isTrue(); + + shouldRead = + ManifestEvaluator.forRowFilter(greaterThanOrEqual("uuid", UUID_MIN_VALUE), SPEC, true) + .eval(FILE); + assertThat(shouldRead).as("Should read: all uuids in range").isTrue(); + + shouldRead = + ManifestEvaluator.forRowFilter(greaterThanOrEqual("uuid", UUID_MAX_VALUE), SPEC, true) + .eval(FILE); + assertThat(shouldRead).as("Should read: one possible uuid").isTrue(); + + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = + ManifestEvaluator.forRowFilter(greaterThanOrEqual("uuid", aboveMax), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should not read: uuid range above upper bound").isFalse(); + } + + /** Tests UUID IN filter using byte-order comparison against partition bounds. */ + @Test + public void testUuidIn() { + UUID belowMin1 = UUID.fromString("00000000-0000-0000-0000-000000000000"); + UUID belowMin2 = UUID.fromString("00000000-0000-0000-0000-000000000001"); + boolean shouldRead = + ManifestEvaluator.forRowFilter(in("uuid", belowMin1, belowMin2), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should not read: uuids below lower bound").isFalse(); + + shouldRead = + ManifestEvaluator.forRowFilter(in("uuid", belowMin1, UUID_MIN_VALUE), SPEC, true) + .eval(FILE); + assertThat(shouldRead).as("Should read: uuid equal to lower bound").isTrue(); + + UUID middle1 = UUID.fromString("7fffffff-ffff-ffff-0000-000000000000"); + UUID middle2 = UUID.fromString("7fffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = + ManifestEvaluator.forRowFilter(in("uuid", middle1, middle2), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: uuids between lower and upper bounds").isTrue(); + + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = + ManifestEvaluator.forRowFilter(in("uuid", UUID_MAX_VALUE, aboveMax), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: uuid equal to upper bound").isTrue(); + + UUID aboveMax2 = UUID.fromString("ffffffff-ffff-ffff-ffff-fffffffffffe"); + shouldRead = + ManifestEvaluator.forRowFilter(in("uuid", aboveMax, aboveMax2), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should not read: uuids above upper bound").isFalse(); + } } diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java index 2f4fbf395739..7279295c8172 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java @@ -42,6 +42,7 @@ import static org.assertj.core.api.Assertions.assertThatThrownBy; import java.util.List; +import java.util.UUID; import org.apache.iceberg.DataFile; import org.apache.iceberg.Schema; import org.apache.iceberg.TestHelpers.Row; @@ -71,7 +72,8 @@ public class TestInclusiveMetricsEvaluator { optional(11, "all_nans_v1_stats", Types.FloatType.get()), optional(12, "nan_and_null_only", Types.DoubleType.get()), optional(13, "no_nan_stats", Types.DoubleType.get()), - optional(14, "some_empty", Types.StringType.get())); + optional(14, "some_empty", Types.StringType.get()), + optional(15, "uuid", Types.UUIDType.get())); private static final Schema NESTED_SCHEMA = new Schema( @@ -91,6 +93,16 @@ public class TestInclusiveMetricsEvaluator { private static final int INT_MIN_VALUE = 30; private static final int INT_MAX_VALUE = 79; + // UUIDs that demonstrate the difference between Java's natural order and byte-order comparison + // UUID_MIN has all zeros in MSB, all ones in LSB: 00000000-0000-0000-ffff-ffffffffffff + // UUID_MAX has all ones in MSB, all zeros in LSB: ffffffff-ffff-ffff-0000-000000000000 + // With byte-order comparison (correct): UUID_MIN < UUID_MAX (0x00... < 0xFF...) + // With Java's natural order (incorrect): UUID_MIN > UUID_MAX (MSB 0 > MSB -1 as signed long) + private static final UUID UUID_MIN_VALUE = + UUID.fromString("00000000-0000-0000-ffff-ffffffffffff"); + private static final UUID UUID_MAX_VALUE = + UUID.fromString("ffffffff-ffff-ffff-0000-000000000000"); + private static final DataFile FILE = new TestDataFile( "file.avro", @@ -109,6 +121,7 @@ public class TestInclusiveMetricsEvaluator { .put(12, 50L) .put(13, 50L) .put(14, 50L) + .put(15, 50L) .buildOrThrow(), // null value counts ImmutableMap.builder() @@ -119,6 +132,7 @@ public class TestInclusiveMetricsEvaluator { .put(11, 0L) .put(12, 1L) .put(14, 0L) + .put(15, 0L) .buildOrThrow(), // nan value counts ImmutableMap.of( @@ -126,17 +140,21 @@ public class TestInclusiveMetricsEvaluator { 8, 10L, 9, 0L), // lower bounds - ImmutableMap.of( - 1, toByteBuffer(IntegerType.get(), INT_MIN_VALUE), - 11, toByteBuffer(Types.FloatType.get(), Float.NaN), - 12, toByteBuffer(Types.DoubleType.get(), Double.NaN), - 14, toByteBuffer(Types.StringType.get(), "")), + ImmutableMap.builder() + .put(1, toByteBuffer(IntegerType.get(), INT_MIN_VALUE)) + .put(11, toByteBuffer(Types.FloatType.get(), Float.NaN)) + .put(12, toByteBuffer(Types.DoubleType.get(), Double.NaN)) + .put(14, toByteBuffer(Types.StringType.get(), "")) + .put(15, toByteBuffer(Types.UUIDType.get(), UUID_MIN_VALUE)) + .buildOrThrow(), // upper bounds - ImmutableMap.of( - 1, toByteBuffer(IntegerType.get(), INT_MAX_VALUE), - 11, toByteBuffer(Types.FloatType.get(), Float.NaN), - 12, toByteBuffer(Types.DoubleType.get(), Double.NaN), - 14, toByteBuffer(Types.StringType.get(), "房东整租霍营小区二层两居室"))); + ImmutableMap.builder() + .put(1, toByteBuffer(IntegerType.get(), INT_MAX_VALUE)) + .put(11, toByteBuffer(Types.FloatType.get(), Float.NaN)) + .put(12, toByteBuffer(Types.DoubleType.get(), Double.NaN)) + .put(14, toByteBuffer(Types.StringType.get(), "房东整租霍营小区二层两居室")) + .put(15, toByteBuffer(Types.UUIDType.get(), UUID_MAX_VALUE)) + .buildOrThrow()); private static final DataFile FILE_2 = new TestDataFile( @@ -970,4 +988,198 @@ public void testNotNullInNestedStruct() { .as("Should not read: optional_address.optional_street2 is optional") .isFalse(); } + + /** Tests UUID equality filter using byte-order comparison against data file metrics. */ + @Test + public void testUuidEq() { + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, equal("uuid", belowMin)).eval(FILE); + assertThat(shouldRead).as("Should not read: uuid below lower bound").isFalse(); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, equal("uuid", UUID_MIN_VALUE)).eval(FILE); + assertThat(shouldRead).as("Should read: uuid equal to lower bound").isTrue(); + + UUID middle = UUID.fromString("7fffffff-ffff-ffff-7fff-ffffffffffff"); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, equal("uuid", middle)).eval(FILE); + assertThat(shouldRead).as("Should read: uuid between lower and upper bounds").isTrue(); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, equal("uuid", UUID_MAX_VALUE)).eval(FILE); + assertThat(shouldRead).as("Should read: uuid equal to upper bound").isTrue(); + + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, equal("uuid", aboveMax)).eval(FILE); + assertThat(shouldRead).as("Should not read: uuid above upper bound").isFalse(); + } + + /** Tests UUID less-than filter using byte-order comparison against data file metrics. */ + @Test + public void testUuidLt() { + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, lessThan("uuid", belowMin)).eval(FILE); + assertThat(shouldRead).as("Should not read: uuid range below lower bound").isFalse(); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThan("uuid", UUID_MIN_VALUE)).eval(FILE); + assertThat(shouldRead) + .as("Should not read: uuid range below lower bound (UUID_MIN is not < UUID_MIN)") + .isFalse(); + + UUID justAboveMin = UUID.fromString("00000000-0000-0001-0000-000000000000"); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThan("uuid", justAboveMin)).eval(FILE); + assertThat(shouldRead).as("Should read: one possible uuid").isTrue(); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThan("uuid", UUID_MAX_VALUE)).eval(FILE); + assertThat(shouldRead).as("Should read: uuid between lower and upper bounds").isTrue(); + + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThan("uuid", aboveMax)).eval(FILE); + assertThat(shouldRead).as("Should read: all uuids in range").isTrue(); + } + + /** Tests UUID less-than-or-equal filter using byte-order comparison against data file metrics. */ + @Test + public void testUuidLtEq() { + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, lessThanOrEqual("uuid", belowMin)).eval(FILE); + assertThat(shouldRead).as("Should not read: uuid range below lower bound").isFalse(); + + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, lessThanOrEqual("uuid", UUID_MIN_VALUE)).eval(FILE); + assertThat(shouldRead).as("Should read: one possible uuid").isTrue(); + + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, lessThanOrEqual("uuid", UUID_MAX_VALUE)).eval(FILE); + assertThat(shouldRead).as("Should read: all uuids in range").isTrue(); + + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, lessThanOrEqual("uuid", aboveMax)).eval(FILE); + assertThat(shouldRead).as("Should read: all uuids in range").isTrue(); + } + + /** Tests UUID greater-than filter using byte-order comparison against data file metrics. */ + @Test + public void testUuidGt() { + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, greaterThan("uuid", belowMin)).eval(FILE); + assertThat(shouldRead).as("Should read: all uuids in range").isTrue(); + + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, greaterThan("uuid", UUID_MIN_VALUE)).eval(FILE); + assertThat(shouldRead).as("Should read: uuid between lower and upper bounds").isTrue(); + + UUID justBelowMax = UUID.fromString("ffffffff-ffff-fffe-ffff-ffffffffffff"); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, greaterThan("uuid", justBelowMax)).eval(FILE); + assertThat(shouldRead).as("Should read: one possible uuid").isTrue(); + + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, greaterThan("uuid", UUID_MAX_VALUE)).eval(FILE); + assertThat(shouldRead) + .as("Should not read: uuid range above upper bound (UUID_MAX is not > UUID_MAX)") + .isFalse(); + + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, greaterThan("uuid", aboveMax)).eval(FILE); + assertThat(shouldRead).as("Should not read: uuid range above upper bound").isFalse(); + } + + /** + * Tests UUID greater-than-or-equal filter using byte-order comparison against data file metrics. + */ + @Test + public void testUuidGtEq() { + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, greaterThanOrEqual("uuid", belowMin)).eval(FILE); + assertThat(shouldRead).as("Should read: all uuids in range").isTrue(); + + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, greaterThanOrEqual("uuid", UUID_MIN_VALUE)) + .eval(FILE); + assertThat(shouldRead).as("Should read: all uuids in range").isTrue(); + + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, greaterThanOrEqual("uuid", UUID_MAX_VALUE)) + .eval(FILE); + assertThat(shouldRead).as("Should read: one possible uuid").isTrue(); + + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, greaterThanOrEqual("uuid", aboveMax)).eval(FILE); + assertThat(shouldRead).as("Should not read: uuid range above upper bound").isFalse(); + } + + /** Tests UUID IN filter using byte-order comparison against data file metrics. */ + @Test + public void testUuidIn() { + UUID belowMin1 = UUID.fromString("00000000-0000-0000-0000-000000000000"); + UUID belowMin2 = UUID.fromString("00000000-0000-0000-0000-000000000001"); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, in("uuid", belowMin1, belowMin2)).eval(FILE); + assertThat(shouldRead).as("Should not read: uuids below lower bound").isFalse(); + + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, in("uuid", belowMin1, UUID_MIN_VALUE)).eval(FILE); + assertThat(shouldRead).as("Should read: uuid equal to lower bound").isTrue(); + + UUID middle1 = UUID.fromString("7fffffff-ffff-ffff-0000-000000000000"); + UUID middle2 = UUID.fromString("7fffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, in("uuid", middle1, middle2)).eval(FILE); + assertThat(shouldRead).as("Should read: uuids between lower and upper bounds").isTrue(); + + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, in("uuid", UUID_MAX_VALUE, aboveMax)).eval(FILE); + assertThat(shouldRead).as("Should read: uuid equal to upper bound").isTrue(); + + UUID aboveMax2 = UUID.fromString("ffffffff-ffff-ffff-ffff-fffffffffffe"); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, in("uuid", aboveMax, aboveMax2)).eval(FILE); + assertThat(shouldRead).as("Should not read: uuids above upper bound").isFalse(); + } + + /** Tests UUID not-equal filter against data file metrics. */ + @Test + public void testUuidNotEq() { + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notEqual("uuid", belowMin)).eval(FILE); + assertThat(shouldRead).as("Should read: notEqual always reads").isTrue(); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notEqual("uuid", UUID_MIN_VALUE)).eval(FILE); + assertThat(shouldRead).as("Should read: notEqual always reads").isTrue(); + + UUID middle = UUID.fromString("7fffffff-ffff-ffff-7fff-ffffffffffff"); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notEqual("uuid", middle)).eval(FILE); + assertThat(shouldRead).as("Should read: notEqual always reads").isTrue(); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notEqual("uuid", UUID_MAX_VALUE)).eval(FILE); + assertThat(shouldRead).as("Should read: notEqual always reads").isTrue(); + + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notEqual("uuid", aboveMax)).eval(FILE); + assertThat(shouldRead).as("Should read: notEqual always reads").isTrue(); + } + + /** Tests UUID NOT IN filter against data file metrics. */ + @Test + public void testUuidNotIn() { + UUID belowMin1 = UUID.fromString("00000000-0000-0000-0000-000000000000"); + UUID belowMin2 = UUID.fromString("00000000-0000-0000-0000-000000000001"); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notIn("uuid", belowMin1, belowMin2)).eval(FILE); + assertThat(shouldRead).as("Should read: notIn always reads").isTrue(); + + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notIn("uuid", UUID_MIN_VALUE, UUID_MAX_VALUE)) + .eval(FILE); + assertThat(shouldRead).as("Should read: notIn always reads").isTrue(); + + UUID middle1 = UUID.fromString("7fffffff-ffff-ffff-0000-000000000000"); + UUID middle2 = UUID.fromString("7fffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notIn("uuid", middle1, middle2)).eval(FILE); + assertThat(shouldRead).as("Should read: notIn always reads").isTrue(); + } } diff --git a/api/src/test/java/org/apache/iceberg/types/TestComparators.java b/api/src/test/java/org/apache/iceberg/types/TestComparators.java index 691e3f04a074..8a27bd84109d 100644 --- a/api/src/test/java/org/apache/iceberg/types/TestComparators.java +++ b/api/src/test/java/org/apache/iceberg/types/TestComparators.java @@ -100,6 +100,35 @@ public void testUuid() { Comparators.forType(Types.UUIDType.get()), UUID.fromString("81873e7d-1374-4493-8e1d-9095eff7046c"), UUID.fromString("fd02441d-1423-4a3f-8785-c7dd5647e26b")); + assertComparesCorrectly( + Comparators.forType(Types.UUIDType.get()), + UUID.fromString("00000000-0000-0000-0000-000000000000"), + UUID.fromString("60000000-0000-0000-0000-000000000000")); + assertComparesCorrectly( + Comparators.forType(Types.UUIDType.get()), + UUID.fromString("60000000-0000-0000-0000-000000000000"), + UUID.fromString("70000000-0000-0000-0000-000000000000")); + // The following assertion fails prior to the introduction of UUIDComparator. + assertComparesCorrectly( + Comparators.forType(Types.UUIDType.get()), + UUID.fromString("70000000-0000-0000-0000-000000000000"), + UUID.fromString("80000000-0000-0000-0000-000000000000")); + assertComparesCorrectly( + Comparators.forType(Types.UUIDType.get()), + UUID.fromString("80000000-0000-0000-0000-000000000000"), + UUID.fromString("90000000-0000-0000-0000-000000000000")); + assertComparesCorrectly( + Comparators.forType(Types.UUIDType.get()), + UUID.fromString("90000000-0000-0000-0000-000000000000"), + UUID.fromString("a0000000-0000-0000-0000-000000000000")); + assertComparesCorrectly( + Comparators.forType(Types.UUIDType.get()), + UUID.fromString("a0000000-0000-0000-0000-000000000000"), + UUID.fromString("f0000000-0000-0000-0000-000000000000")); + assertComparesCorrectly( + Comparators.forType(Types.UUIDType.get()), + UUID.fromString("ffffffff-ffff-ffff-ffff-fffffffffffe"), + UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff")); } @Test diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java index b0d98b358b6d..0c3b60827b96 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java @@ -29,6 +29,7 @@ import org.apache.iceberg.SortKey; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Comparators; public class DataDistributionUtil { private DataDistributionUtil() {} @@ -162,7 +163,7 @@ public static UUID[] reservoirSampleUUIDs(int sampleSize, int reservoirSize) { } } - Arrays.sort(reservoir); + Arrays.sort(reservoir, Comparators.uuids()); return reservoir; } @@ -172,7 +173,7 @@ public static UUID[] rangeBoundSampleUUIDs(UUID[] sampledUUIDs, int rangeBoundSi for (int i = 0; i < rangeBoundSize; ++i) { rangeBounds[i] = sampledUUIDs[i * step]; } - Arrays.sort(rangeBounds); + Arrays.sort(rangeBounds, Comparators.uuids()); return rangeBounds; } } diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java index b0d98b358b6d..0c3b60827b96 100644 --- a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java @@ -29,6 +29,7 @@ import org.apache.iceberg.SortKey; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Comparators; public class DataDistributionUtil { private DataDistributionUtil() {} @@ -162,7 +163,7 @@ public static UUID[] reservoirSampleUUIDs(int sampleSize, int reservoirSize) { } } - Arrays.sort(reservoir); + Arrays.sort(reservoir, Comparators.uuids()); return reservoir; } @@ -172,7 +173,7 @@ public static UUID[] rangeBoundSampleUUIDs(UUID[] sampledUUIDs, int rangeBoundSi for (int i = 0; i < rangeBoundSize; ++i) { rangeBounds[i] = sampledUUIDs[i * step]; } - Arrays.sort(rangeBounds); + Arrays.sort(rangeBounds, Comparators.uuids()); return rangeBounds; } } diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java index b0d98b358b6d..0c3b60827b96 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java @@ -29,6 +29,7 @@ import org.apache.iceberg.SortKey; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Comparators; public class DataDistributionUtil { private DataDistributionUtil() {} @@ -162,7 +163,7 @@ public static UUID[] reservoirSampleUUIDs(int sampleSize, int reservoirSize) { } } - Arrays.sort(reservoir); + Arrays.sort(reservoir, Comparators.uuids()); return reservoir; } @@ -172,7 +173,7 @@ public static UUID[] rangeBoundSampleUUIDs(UUID[] sampledUUIDs, int rangeBoundSi for (int i = 0; i < rangeBoundSize; ++i) { rangeBounds[i] = sampledUUIDs[i * step]; } - Arrays.sort(rangeBounds); + Arrays.sort(rangeBounds, Comparators.uuids()); return rangeBounds; } }