Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -609,6 +609,9 @@ public String toString() {
}

static class UUIDLiteral extends ComparableLiteral<UUID> {
private static final Comparator<UUID> CMP =
Comparators.<UUID>nullsFirst().thenComparing(Comparators.uuids());

UUIDLiteral(UUID value) {
super(value);
}
Expand All @@ -622,6 +625,11 @@ public <T> Literal<T> to(Type type) {
return null;
}

@Override
public Comparator<UUID> comparator() {
return CMP;
}

@Override
protected Type.TypeID typeId() {
return Type.TypeID.UUID;
Expand Down
43 changes: 42 additions & 1 deletion api/src/main/java/org/apache/iceberg/types/Comparators.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.function.IntFunction;
import org.apache.iceberg.StructLike;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
Expand All @@ -46,7 +47,7 @@ private Comparators() {}
.put(Types.TimestampNanoType.withZone(), Comparator.naturalOrder())
.put(Types.TimestampNanoType.withoutZone(), Comparator.naturalOrder())
.put(Types.StringType.get(), Comparators.charSequences())
.put(Types.UUIDType.get(), Comparator.naturalOrder())
.put(Types.UUIDType.get(), Comparators.uuids())
.put(Types.BinaryType.get(), Comparators.unsignedBytes())
.buildOrThrow();

Expand Down Expand Up @@ -232,6 +233,10 @@ public static Comparator<CharSequence> filePath() {
return FilePathComparator.INSTANCE;
}

public static Comparator<UUID> uuids() {
return UUIDComparator.INSTANCE;
}

private static class NullsFirst<T> implements Comparator<T> {
private static final NullsFirst<?> INSTANCE = new NullsFirst<>();

Expand Down Expand Up @@ -447,4 +452,40 @@ public int compare(CharSequence s1, CharSequence s2) {
return 0;
}
}

/**
* Compares UUIDs using unsigned byte-wise comparison using big-endian byte-order compliant with
* RFC 4122 and RFC 9562. Java's UUID.compareTo() compares most significant bits first, then least
* significant bits using signed value comparisons, which is a <a
* href="https://bugs.openjdk.org/browse/JDK-7025832">known bug</a>.
*/
private static class UUIDComparator implements Comparator<UUID> {
private static final UUIDComparator INSTANCE = new UUIDComparator();

private UUIDComparator() {}

@Override
public int compare(UUID uuid1, UUID uuid2) {
if (uuid1 == uuid2) {
return 0;
}

// Compare most significant bits first (bytes 0-7 in big-endian representation)
long msb1 = uuid1.getMostSignificantBits();
long msb2 = uuid2.getMostSignificantBits();

// Use unsigned comparison for the most significant bits
int msbCompare = Long.compareUnsigned(msb1, msb2);
if (msbCompare != 0) {
return msbCompare;
}

// If most significant bits are equal, compare least significant bits (bytes 8-15)
long lsb1 = uuid1.getLeastSignificantBits();
long lsb2 = uuid2.getLeastSignificantBits();

// Use unsigned comparison for the least significant bits
return Long.compareUnsigned(lsb1, lsb2);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
import static org.assertj.core.api.Assertions.assertThatThrownBy;

import java.nio.ByteBuffer;
import java.util.UUID;
import org.apache.iceberg.ManifestFile;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
Expand All @@ -66,7 +67,8 @@ public class TestInclusiveManifestEvaluator {
optional(12, "no_nan_or_null", Types.DoubleType.get()),
optional(13, "all_nulls_missing_nan_float", Types.FloatType.get()),
optional(14, "all_same_value_or_null", Types.StringType.get()),
optional(15, "no_nulls_same_value_a", Types.StringType.get()));
optional(15, "no_nulls_same_value_a", Types.StringType.get()),
optional(16, "uuid", Types.UUIDType.get()));

private static final PartitionSpec SPEC =
PartitionSpec.builderFor(SCHEMA)
Expand All @@ -84,6 +86,7 @@ public class TestInclusiveManifestEvaluator {
.identity("all_nulls_missing_nan_float")
.identity("all_same_value_or_null")
.identity("no_nulls_same_value_a")
.identity("uuid")
.build();

private static final int INT_MIN_VALUE = 30;
Expand All @@ -95,6 +98,18 @@ public class TestInclusiveManifestEvaluator {
private static final ByteBuffer STRING_MIN = toByteBuffer(Types.StringType.get(), "a");
private static final ByteBuffer STRING_MAX = toByteBuffer(Types.StringType.get(), "z");

// UUID_MIN has all zeros in MSB, all ones in LSB: 00000000-0000-0000-ffff-ffffffffffff
// UUID_MAX has all ones in MSB, all zeros in LSB: ffffffff-ffff-ffff-0000-000000000000
// With unsigned byte-wise comparison (correct): UUID_MIN < UUID_MAX (0x00... < 0xFF...)
// With Java's natural order (incorrect): UUID_MIN > UUID_MAX (MSB 0 > MSB -1 as signed long)
private static final UUID UUID_MIN_VALUE =
UUID.fromString("00000000-0000-0000-ffff-ffffffffffff");
private static final UUID UUID_MAX_VALUE =
UUID.fromString("ffffffff-ffff-ffff-0000-000000000000");

private static final ByteBuffer UUID_MIN = toByteBuffer(Types.UUIDType.get(), UUID_MIN_VALUE);
private static final ByteBuffer UUID_MAX = toByteBuffer(Types.UUIDType.get(), UUID_MAX_VALUE);

private static final ManifestFile NO_STATS =
new TestHelpers.TestManifestFile(
"manifest-list.avro", 1024, 0, System.currentTimeMillis(), null, null, null, null, null);
Expand Down Expand Up @@ -128,7 +143,8 @@ public class TestInclusiveManifestEvaluator {
toByteBuffer(Types.FloatType.get(), 20F)),
new TestHelpers.TestFieldSummary(true, null, null),
new TestHelpers.TestFieldSummary(true, STRING_MIN, STRING_MIN),
new TestHelpers.TestFieldSummary(false, STRING_MIN, STRING_MIN)),
new TestHelpers.TestFieldSummary(false, STRING_MIN, STRING_MIN),
new TestHelpers.TestFieldSummary(false, UUID_MIN, UUID_MAX)),
null);

@Test
Expand Down Expand Up @@ -753,4 +769,167 @@ public void testIntegerNotIn() {
ManifestEvaluator.forRowFilter(notIn("no_nulls", "abc", "def"), SPEC, true).eval(FILE);
assertThat(shouldRead).as("Should read: notIn on no nulls column").isTrue();
}

/** Tests UUID equality filter using byte-order comparison against partition bounds. */
@Test
public void testUuidEq() {
UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000");
boolean shouldRead =
ManifestEvaluator.forRowFilter(equal("uuid", belowMin), SPEC, true).eval(FILE);
assertThat(shouldRead).as("Should not read: uuid below lower bound").isFalse();

shouldRead =
ManifestEvaluator.forRowFilter(equal("uuid", UUID_MIN_VALUE), SPEC, true).eval(FILE);
assertThat(shouldRead).as("Should read: uuid equal to lower bound").isTrue();

UUID middle = UUID.fromString("7fffffff-ffff-ffff-7fff-ffffffffffff");
shouldRead = ManifestEvaluator.forRowFilter(equal("uuid", middle), SPEC, true).eval(FILE);
assertThat(shouldRead).as("Should read: uuid between lower and upper bounds").isTrue();

shouldRead =
ManifestEvaluator.forRowFilter(equal("uuid", UUID_MAX_VALUE), SPEC, true).eval(FILE);
assertThat(shouldRead).as("Should read: uuid equal to upper bound").isTrue();

UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff");
shouldRead = ManifestEvaluator.forRowFilter(equal("uuid", aboveMax), SPEC, true).eval(FILE);
assertThat(shouldRead).as("Should not read: uuid above upper bound").isFalse();
}

/** Tests UUID less-than filter using byte-order comparison against partition bounds. */
@Test
public void testUuidLt() {
UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000");
boolean shouldRead =
ManifestEvaluator.forRowFilter(lessThan("uuid", belowMin), SPEC, true).eval(FILE);
assertThat(shouldRead).as("Should not read: uuid range below lower bound").isFalse();

shouldRead =
ManifestEvaluator.forRowFilter(lessThan("uuid", UUID_MIN_VALUE), SPEC, true).eval(FILE);
assertThat(shouldRead)
.as("Should not read: uuid range below lower bound (UUID_MIN is not < UUID_MIN)")
.isFalse();

UUID justAboveMin = UUID.fromString("00000000-0000-0001-0000-000000000000");
shouldRead =
ManifestEvaluator.forRowFilter(lessThan("uuid", justAboveMin), SPEC, true).eval(FILE);
assertThat(shouldRead).as("Should read: one possible uuid").isTrue();

shouldRead =
ManifestEvaluator.forRowFilter(lessThan("uuid", UUID_MAX_VALUE), SPEC, true).eval(FILE);
assertThat(shouldRead).as("Should read: uuid between lower and upper bounds").isTrue();

UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff");
shouldRead = ManifestEvaluator.forRowFilter(lessThan("uuid", aboveMax), SPEC, true).eval(FILE);
assertThat(shouldRead).as("Should read: all uuids in range").isTrue();
}

/** Tests UUID less-than-or-equal filter using byte-order comparison against partition bounds. */
@Test
public void testUuidLtEq() {
UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000");
boolean shouldRead =
ManifestEvaluator.forRowFilter(lessThanOrEqual("uuid", belowMin), SPEC, true).eval(FILE);
assertThat(shouldRead).as("Should not read: uuid range below lower bound").isFalse();

shouldRead =
ManifestEvaluator.forRowFilter(lessThanOrEqual("uuid", UUID_MIN_VALUE), SPEC, true)
.eval(FILE);
assertThat(shouldRead).as("Should read: one possible uuid").isTrue();

shouldRead =
ManifestEvaluator.forRowFilter(lessThanOrEqual("uuid", UUID_MAX_VALUE), SPEC, true)
.eval(FILE);
assertThat(shouldRead).as("Should read: all uuids in range").isTrue();

UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff");
shouldRead =
ManifestEvaluator.forRowFilter(lessThanOrEqual("uuid", aboveMax), SPEC, true).eval(FILE);
assertThat(shouldRead).as("Should read: all uuids in range").isTrue();
}

/** Tests UUID greater-than filter using byte-order comparison against partition bounds. */
@Test
public void testUuidGt() {
UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000");
boolean shouldRead =
ManifestEvaluator.forRowFilter(greaterThan("uuid", belowMin), SPEC, true).eval(FILE);
assertThat(shouldRead).as("Should read: all uuids in range").isTrue();

shouldRead =
ManifestEvaluator.forRowFilter(greaterThan("uuid", UUID_MIN_VALUE), SPEC, true).eval(FILE);
assertThat(shouldRead).as("Should read: uuid between lower and upper bounds").isTrue();

UUID justBelowMax = UUID.fromString("ffffffff-ffff-fffe-ffff-ffffffffffff");
shouldRead =
ManifestEvaluator.forRowFilter(greaterThan("uuid", justBelowMax), SPEC, true).eval(FILE);
assertThat(shouldRead).as("Should read: one possible uuid").isTrue();

shouldRead =
ManifestEvaluator.forRowFilter(greaterThan("uuid", UUID_MAX_VALUE), SPEC, true).eval(FILE);
assertThat(shouldRead)
.as("Should not read: uuid range above upper bound (UUID_MAX is not > UUID_MAX)")
.isFalse();

UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff");
shouldRead =
ManifestEvaluator.forRowFilter(greaterThan("uuid", aboveMax), SPEC, true).eval(FILE);
assertThat(shouldRead).as("Should not read: uuid range above upper bound").isFalse();
}

/**
* Tests UUID greater-than-or-equal filter using byte-order comparison against partition bounds.
*/
@Test
public void testUuidGtEq() {
UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000");
boolean shouldRead =
ManifestEvaluator.forRowFilter(greaterThanOrEqual("uuid", belowMin), SPEC, true).eval(FILE);
assertThat(shouldRead).as("Should read: all uuids in range").isTrue();

shouldRead =
ManifestEvaluator.forRowFilter(greaterThanOrEqual("uuid", UUID_MIN_VALUE), SPEC, true)
.eval(FILE);
assertThat(shouldRead).as("Should read: all uuids in range").isTrue();

shouldRead =
ManifestEvaluator.forRowFilter(greaterThanOrEqual("uuid", UUID_MAX_VALUE), SPEC, true)
.eval(FILE);
assertThat(shouldRead).as("Should read: one possible uuid").isTrue();

UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff");
shouldRead =
ManifestEvaluator.forRowFilter(greaterThanOrEqual("uuid", aboveMax), SPEC, true).eval(FILE);
assertThat(shouldRead).as("Should not read: uuid range above upper bound").isFalse();
}

/** Tests UUID IN filter using byte-order comparison against partition bounds. */
@Test
public void testUuidIn() {
UUID belowMin1 = UUID.fromString("00000000-0000-0000-0000-000000000000");
UUID belowMin2 = UUID.fromString("00000000-0000-0000-0000-000000000001");
boolean shouldRead =
ManifestEvaluator.forRowFilter(in("uuid", belowMin1, belowMin2), SPEC, true).eval(FILE);
assertThat(shouldRead).as("Should not read: uuids below lower bound").isFalse();

shouldRead =
ManifestEvaluator.forRowFilter(in("uuid", belowMin1, UUID_MIN_VALUE), SPEC, true)
.eval(FILE);
assertThat(shouldRead).as("Should read: uuid equal to lower bound").isTrue();

UUID middle1 = UUID.fromString("7fffffff-ffff-ffff-0000-000000000000");
UUID middle2 = UUID.fromString("7fffffff-ffff-ffff-ffff-ffffffffffff");
shouldRead =
ManifestEvaluator.forRowFilter(in("uuid", middle1, middle2), SPEC, true).eval(FILE);
assertThat(shouldRead).as("Should read: uuids between lower and upper bounds").isTrue();

UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff");
shouldRead =
ManifestEvaluator.forRowFilter(in("uuid", UUID_MAX_VALUE, aboveMax), SPEC, true).eval(FILE);
assertThat(shouldRead).as("Should read: uuid equal to upper bound").isTrue();

UUID aboveMax2 = UUID.fromString("ffffffff-ffff-ffff-ffff-fffffffffffe");
shouldRead =
ManifestEvaluator.forRowFilter(in("uuid", aboveMax, aboveMax2), SPEC, true).eval(FILE);
assertThat(shouldRead).as("Should not read: uuids above upper bound").isFalse();
}
}
Loading