From 37d4fc15ff88c9d4fbad155ffe5d10c258fe6a6a Mon Sep 17 00:00:00 2001 From: bodduv Date: Tue, 4 Nov 2025 11:24:39 +0100 Subject: [PATCH 01/11] API: Use unsigned byte-wise comparison to fix UUID comparison bug Java's UUID.compareTo() uses signed comparison of most significant bits and least significant bits, which is not compliant with RFC 4122, RFC 9562. This causes incorrect ManifestEntry and ManifestFile filtering/pruning in the presence of UUID filters. --- .../apache/iceberg/expressions/Literals.java | 8 + .../org/apache/iceberg/types/Comparators.java | 43 +++- .../TestInclusiveManifestEvaluator.java | 175 ++++++++++++++- .../TestInclusiveMetricsEvaluator.java | 206 +++++++++++++++++- .../apache/iceberg/types/TestComparators.java | 29 +++ .../sink/shuffle/DataDistributionUtil.java | 5 +- .../sink/shuffle/DataDistributionUtil.java | 5 +- .../sink/shuffle/DataDistributionUtil.java | 5 +- 8 files changed, 464 insertions(+), 12 deletions(-) diff --git a/api/src/main/java/org/apache/iceberg/expressions/Literals.java b/api/src/main/java/org/apache/iceberg/expressions/Literals.java index 3a45eb804f35..b5a1cbd2f917 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/Literals.java +++ b/api/src/main/java/org/apache/iceberg/expressions/Literals.java @@ -609,6 +609,9 @@ public String toString() { } static class UUIDLiteral extends ComparableLiteral { + private static final Comparator CMP = + Comparators.nullsFirst().thenComparing(Comparators.uuids()); + UUIDLiteral(UUID value) { super(value); } @@ -622,6 +625,11 @@ public Literal to(Type type) { return null; } + @Override + public Comparator comparator() { + return CMP; + } + @Override protected Type.TypeID typeId() { return Type.TypeID.UUID; diff --git a/api/src/main/java/org/apache/iceberg/types/Comparators.java b/api/src/main/java/org/apache/iceberg/types/Comparators.java index ab59c895686d..d99b66961fcf 100644 --- a/api/src/main/java/org/apache/iceberg/types/Comparators.java +++ b/api/src/main/java/org/apache/iceberg/types/Comparators.java @@ -22,6 +22,7 @@ import java.util.Comparator; import java.util.List; import java.util.Map; +import java.util.UUID; import java.util.function.IntFunction; import org.apache.iceberg.StructLike; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; @@ -46,7 +47,7 @@ private Comparators() {} .put(Types.TimestampNanoType.withZone(), Comparator.naturalOrder()) .put(Types.TimestampNanoType.withoutZone(), Comparator.naturalOrder()) .put(Types.StringType.get(), Comparators.charSequences()) - .put(Types.UUIDType.get(), Comparator.naturalOrder()) + .put(Types.UUIDType.get(), Comparators.uuids()) .put(Types.BinaryType.get(), Comparators.unsignedBytes()) .put(Types.UnknownType.get(), Comparator.nullsFirst(Comparator.naturalOrder())) .buildOrThrow(); @@ -233,6 +234,10 @@ public static Comparator filePath() { return FilePathComparator.INSTANCE; } + public static Comparator uuids() { + return UUIDComparator.INSTANCE; + } + private static class NullsFirst implements Comparator { private static final NullsFirst INSTANCE = new NullsFirst<>(); @@ -448,4 +453,40 @@ public int compare(CharSequence s1, CharSequence s2) { return 0; } } + + /** + * Compares UUIDs using unsigned byte-wise comparison using big-endian byte-order compliant with + * RFC 4122 and RFC 9562. Java's UUID.compareTo() compares most significant bits first, then least + * significant bits using signed value comparisons, which is a known bug. + */ + private static class UUIDComparator implements Comparator { + private static final UUIDComparator INSTANCE = new UUIDComparator(); + + private UUIDComparator() {} + + @Override + public int compare(UUID uuid1, UUID uuid2) { + if (uuid1 == uuid2) { + return 0; + } + + // Compare most significant bits first (bytes 0-7 in big-endian representation) + long msb1 = uuid1.getMostSignificantBits(); + long msb2 = uuid2.getMostSignificantBits(); + + // Use unsigned comparison for the most significant bits + int msbCompare = Long.compareUnsigned(msb1, msb2); + if (msbCompare != 0) { + return msbCompare; + } + + // If most significant bits are equal, compare least significant bits (bytes 8-15) + long lsb1 = uuid1.getLeastSignificantBits(); + long lsb2 = uuid2.getLeastSignificantBits(); + + // Use unsigned comparison for the least significant bits + return Long.compareUnsigned(lsb1, lsb2); + } + } } diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java index 78e6064eb427..1673bd7c2d75 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java @@ -42,6 +42,7 @@ import static org.assertj.core.api.Assertions.assertThatThrownBy; import java.nio.ByteBuffer; +import java.util.UUID; import org.apache.iceberg.ManifestFile; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; @@ -69,7 +70,8 @@ public class TestInclusiveManifestEvaluator { optional(15, "no_nulls_same_value_a", Types.StringType.get()), optional(16, "single_value_with_nan", Types.FloatType.get()), optional(17, "single_value_nan_unknown", Types.FloatType.get()), - optional(18, "single_value_no_nan", Types.FloatType.get())); + optional(18, "single_value_no_nan", Types.FloatType.get()), + optional(19, "uuid", Types.UUIDType.get())); private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA) @@ -90,6 +92,7 @@ public class TestInclusiveManifestEvaluator { .identity("single_value_with_nan") .identity("single_value_nan_unknown") .identity("single_value_no_nan") + .identity("uuid") .build(); private static final int INT_MIN_VALUE = 30; @@ -101,6 +104,18 @@ public class TestInclusiveManifestEvaluator { private static final ByteBuffer STRING_MIN = toByteBuffer(Types.StringType.get(), "a"); private static final ByteBuffer STRING_MAX = toByteBuffer(Types.StringType.get(), "z"); + // UUID_MIN has all zeros in MSB, all ones in LSB: 00000000-0000-0000-ffff-ffffffffffff + // UUID_MAX has all ones in MSB, all zeros in LSB: ffffffff-ffff-ffff-0000-000000000000 + // With unsigned byte-wise comparison (correct): UUID_MIN < UUID_MAX (0x00... < 0xFF...) + // With Java's natural order (incorrect): UUID_MIN > UUID_MAX (MSB 0 > MSB -1 as signed long) + private static final UUID UUID_MIN_VALUE = + UUID.fromString("00000000-0000-0000-ffff-ffffffffffff"); + private static final UUID UUID_MAX_VALUE = + UUID.fromString("ffffffff-ffff-ffff-0000-000000000000"); + + private static final ByteBuffer UUID_MIN = toByteBuffer(Types.UUIDType.get(), UUID_MIN_VALUE); + private static final ByteBuffer UUID_MAX = toByteBuffer(Types.UUIDType.get(), UUID_MAX_VALUE); + private static final ManifestFile NO_STATS = new TestHelpers.TestManifestFile( "manifest-list.avro", 1024, 0, System.currentTimeMillis(), null, null, null, null, null); @@ -148,7 +163,8 @@ public class TestInclusiveManifestEvaluator { false, false, toByteBuffer(Types.FloatType.get(), 5.0F), - toByteBuffer(Types.FloatType.get(), 5.0F))), + toByteBuffer(Types.FloatType.get(), 5.0F)), + new TestHelpers.TestFieldSummary(false, UUID_MIN, UUID_MAX)), null); @Test @@ -853,4 +869,159 @@ public void testNotInWithSingleValue() { .as("Should not read: manifest contains single float value with no NaNs") .isFalse(); } + + @Test + public void testUuidEq() { + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + boolean shouldRead = + ManifestEvaluator.forRowFilter(equal("uuid", belowMin), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should not read: uuid below lower bound").isFalse(); + + shouldRead = + ManifestEvaluator.forRowFilter(equal("uuid", UUID_MIN_VALUE), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: uuid equal to lower bound").isTrue(); + + UUID between = UUID.fromString("7fffffff-ffff-ffff-7fff-ffffffffffff"); + shouldRead = ManifestEvaluator.forRowFilter(equal("uuid", between), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: uuid between lower and upper bounds").isTrue(); + + shouldRead = + ManifestEvaluator.forRowFilter(equal("uuid", UUID_MAX_VALUE), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: uuid equal to upper bound").isTrue(); + + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = ManifestEvaluator.forRowFilter(equal("uuid", aboveMax), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should not read: uuid above upper bound").isFalse(); + } + + @Test + public void testUuidLt() { + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + boolean shouldRead = + ManifestEvaluator.forRowFilter(lessThan("uuid", belowMin), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should not read: uuid range below lower bound").isFalse(); + + shouldRead = + ManifestEvaluator.forRowFilter(lessThan("uuid", UUID_MIN_VALUE), SPEC, true).eval(FILE); + assertThat(shouldRead) + .as("Should not read: uuid range below lower bound (UUID_MIN is not < UUID_MIN)") + .isFalse(); + + UUID justAboveMin = UUID.fromString("00000000-0000-0001-0000-000000000000"); + shouldRead = + ManifestEvaluator.forRowFilter(lessThan("uuid", justAboveMin), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: one possible uuid").isTrue(); + + shouldRead = + ManifestEvaluator.forRowFilter(lessThan("uuid", UUID_MAX_VALUE), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: uuid between lower and upper bounds").isTrue(); + + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = ManifestEvaluator.forRowFilter(lessThan("uuid", aboveMax), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: all uuids in range").isTrue(); + } + + @Test + public void testUuidLtEq() { + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + boolean shouldRead = + ManifestEvaluator.forRowFilter(lessThanOrEqual("uuid", belowMin), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should not read: uuid range below lower bound").isFalse(); + + shouldRead = + ManifestEvaluator.forRowFilter(lessThanOrEqual("uuid", UUID_MIN_VALUE), SPEC, true) + .eval(FILE); + assertThat(shouldRead).as("Should read: one possible uuid").isTrue(); + + shouldRead = + ManifestEvaluator.forRowFilter(lessThanOrEqual("uuid", UUID_MAX_VALUE), SPEC, true) + .eval(FILE); + assertThat(shouldRead).as("Should read: all uuids in range").isTrue(); + + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = + ManifestEvaluator.forRowFilter(lessThanOrEqual("uuid", aboveMax), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: all uuids in range").isTrue(); + } + + @Test + public void testUuidGt() { + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + boolean shouldRead = + ManifestEvaluator.forRowFilter(greaterThan("uuid", belowMin), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: all uuids in range").isTrue(); + + shouldRead = + ManifestEvaluator.forRowFilter(greaterThan("uuid", UUID_MIN_VALUE), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: uuid between lower and upper bounds").isTrue(); + + UUID justBelowMax = UUID.fromString("ffffffff-ffff-fffe-ffff-ffffffffffff"); + shouldRead = + ManifestEvaluator.forRowFilter(greaterThan("uuid", justBelowMax), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: one possible uuid").isTrue(); + + shouldRead = + ManifestEvaluator.forRowFilter(greaterThan("uuid", UUID_MAX_VALUE), SPEC, true).eval(FILE); + assertThat(shouldRead) + .as("Should not read: uuid range above upper bound (UUID_MAX is not > UUID_MAX)") + .isFalse(); + + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = + ManifestEvaluator.forRowFilter(greaterThan("uuid", aboveMax), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should not read: uuid range above upper bound").isFalse(); + } + + @Test + public void testUuidGtEq() { + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + boolean shouldRead = + ManifestEvaluator.forRowFilter(greaterThanOrEqual("uuid", belowMin), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: all uuids in range").isTrue(); + + shouldRead = + ManifestEvaluator.forRowFilter(greaterThanOrEqual("uuid", UUID_MIN_VALUE), SPEC, true) + .eval(FILE); + assertThat(shouldRead).as("Should read: all uuids in range").isTrue(); + + shouldRead = + ManifestEvaluator.forRowFilter(greaterThanOrEqual("uuid", UUID_MAX_VALUE), SPEC, true) + .eval(FILE); + assertThat(shouldRead).as("Should read: one possible uuid").isTrue(); + + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = + ManifestEvaluator.forRowFilter(greaterThanOrEqual("uuid", aboveMax), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should not read: uuid range above upper bound").isFalse(); + } + + @Test + public void testUuidIn() { + UUID belowMin1 = UUID.fromString("00000000-0000-0000-0000-000000000000"); + UUID belowMin2 = UUID.fromString("00000000-0000-0000-0000-000000000001"); + boolean shouldRead = + ManifestEvaluator.forRowFilter(in("uuid", belowMin1, belowMin2), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should not read: uuids below lower bound").isFalse(); + + shouldRead = + ManifestEvaluator.forRowFilter(in("uuid", belowMin1, UUID_MIN_VALUE), SPEC, true) + .eval(FILE); + assertThat(shouldRead).as("Should read: uuid equal to lower bound").isTrue(); + + UUID middle1 = UUID.fromString("7fffffff-ffff-ffff-0000-000000000000"); + UUID middle2 = UUID.fromString("7fffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = + ManifestEvaluator.forRowFilter(in("uuid", middle1, middle2), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: uuids between lower and upper bounds").isTrue(); + + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = + ManifestEvaluator.forRowFilter(in("uuid", UUID_MAX_VALUE, aboveMax), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: uuid equal to upper bound").isTrue(); + + UUID aboveMax2 = UUID.fromString("ffffffff-ffff-ffff-ffff-fffffffffffe"); + shouldRead = + ManifestEvaluator.forRowFilter(in("uuid", aboveMax, aboveMax2), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should not read: uuids above upper bound").isFalse(); + } } diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java index 5f0ca2659fbf..a723746dadda 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java @@ -42,6 +42,7 @@ import static org.assertj.core.api.Assertions.assertThatThrownBy; import java.util.List; +import java.util.UUID; import org.apache.iceberg.DataFile; import org.apache.iceberg.Schema; import org.apache.iceberg.TestHelpers.Row; @@ -71,7 +72,8 @@ public class TestInclusiveMetricsEvaluator { optional(11, "all_nans_v1_stats", Types.FloatType.get()), optional(12, "nan_and_null_only", Types.DoubleType.get()), optional(13, "no_nan_stats", Types.DoubleType.get()), - optional(14, "some_empty", Types.StringType.get())); + optional(14, "some_empty", Types.StringType.get()), + optional(15, "uuid", Types.UUIDType.get())); private static final Schema NESTED_SCHEMA = new Schema( @@ -91,6 +93,16 @@ public class TestInclusiveMetricsEvaluator { private static final int INT_MIN_VALUE = 30; private static final int INT_MAX_VALUE = 79; + // UUIDs that demonstrate the difference between Java's natural order and byte-order comparison + // UUID_MIN has all zeros in MSB, all ones in LSB: 00000000-0000-0000-ffff-ffffffffffff + // UUID_MAX has all ones in MSB, all zeros in LSB: ffffffff-ffff-ffff-0000-000000000000 + // With byte-order comparison (correct): UUID_MIN < UUID_MAX (0x00... < 0xFF...) + // With Java's natural order (incorrect): UUID_MIN > UUID_MAX (MSB 0 > MSB -1 as signed long) + private static final UUID UUID_MIN_VALUE = + UUID.fromString("00000000-0000-0000-ffff-ffffffffffff"); + private static final UUID UUID_MAX_VALUE = + UUID.fromString("ffffffff-ffff-ffff-0000-000000000000"); + private static final DataFile FILE = new TestDataFile( "file.avro", @@ -109,6 +121,7 @@ public class TestInclusiveMetricsEvaluator { .put(12, 50L) .put(13, 50L) .put(14, 50L) + .put(15, 50L) .buildOrThrow(), // null value counts ImmutableMap.builder() @@ -119,6 +132,7 @@ public class TestInclusiveMetricsEvaluator { .put(11, 0L) .put(12, 1L) .put(14, 0L) + .put(15, 0L) .buildOrThrow(), // nan value counts ImmutableMap.of( @@ -130,13 +144,15 @@ public class TestInclusiveMetricsEvaluator { 1, toByteBuffer(IntegerType.get(), INT_MIN_VALUE), 11, toByteBuffer(Types.FloatType.get(), Float.NaN), 12, toByteBuffer(Types.DoubleType.get(), Double.NaN), - 14, toByteBuffer(Types.StringType.get(), "")), + 14, toByteBuffer(Types.StringType.get(), ""), + 15, toByteBuffer(Types.UUIDType.get(), UUID_MIN_VALUE)), // upper bounds ImmutableMap.of( 1, toByteBuffer(IntegerType.get(), INT_MAX_VALUE), 11, toByteBuffer(Types.FloatType.get(), Float.NaN), 12, toByteBuffer(Types.DoubleType.get(), Double.NaN), - 14, toByteBuffer(Types.StringType.get(), "房东整租霍营小区二层两居室"))); + 14, toByteBuffer(Types.StringType.get(), "房东整租霍营小区二层两居室"), + 15, toByteBuffer(Types.UUIDType.get(), UUID_MAX_VALUE))); private static final DataFile FILE_2 = new TestDataFile( @@ -1138,4 +1154,188 @@ public void testNotInWithSingleValue() { .as("Should read: file has NaN values which match NOT IN predicate") .isTrue(); } + + @Test + public void testUuidEq() { + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, equal("uuid", belowMin)).eval(FILE); + assertThat(shouldRead).as("Should not read: uuid below lower bound").isFalse(); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, equal("uuid", UUID_MIN_VALUE)).eval(FILE); + assertThat(shouldRead).as("Should read: uuid equal to lower bound").isTrue(); + + UUID between = UUID.fromString("7fffffff-ffff-ffff-7fff-ffffffffffff"); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, equal("uuid", between)).eval(FILE); + assertThat(shouldRead).as("Should read: uuid between lower and upper bounds").isTrue(); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, equal("uuid", UUID_MAX_VALUE)).eval(FILE); + assertThat(shouldRead).as("Should read: uuid equal to upper bound").isTrue(); + + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, equal("uuid", aboveMax)).eval(FILE); + assertThat(shouldRead).as("Should not read: uuid above upper bound").isFalse(); + } + + @Test + public void testUuidLt() { + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, lessThan("uuid", belowMin)).eval(FILE); + assertThat(shouldRead).as("Should not read: uuid range below lower bound").isFalse(); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThan("uuid", UUID_MIN_VALUE)).eval(FILE); + assertThat(shouldRead) + .as("Should not read: uuid range below lower bound (UUID_MIN is not < UUID_MIN)") + .isFalse(); + + UUID justAboveMin = UUID.fromString("00000000-0000-0001-0000-000000000000"); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThan("uuid", justAboveMin)).eval(FILE); + assertThat(shouldRead).as("Should read: one possible uuid").isTrue(); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThan("uuid", UUID_MAX_VALUE)).eval(FILE); + assertThat(shouldRead).as("Should read: uuid between lower and upper bounds").isTrue(); + + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThan("uuid", aboveMax)).eval(FILE); + assertThat(shouldRead).as("Should read: all uuids in range").isTrue(); + } + + @Test + public void testUuidLtEq() { + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, lessThanOrEqual("uuid", belowMin)).eval(FILE); + assertThat(shouldRead).as("Should not read: uuid range below lower bound").isFalse(); + + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, lessThanOrEqual("uuid", UUID_MIN_VALUE)).eval(FILE); + assertThat(shouldRead).as("Should read: one possible uuid").isTrue(); + + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, lessThanOrEqual("uuid", UUID_MAX_VALUE)).eval(FILE); + assertThat(shouldRead).as("Should read: all uuids in range").isTrue(); + + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, lessThanOrEqual("uuid", aboveMax)).eval(FILE); + assertThat(shouldRead).as("Should read: all uuids in range").isTrue(); + } + + @Test + public void testUuidGt() { + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, greaterThan("uuid", belowMin)).eval(FILE); + assertThat(shouldRead).as("Should read: all uuids in range").isTrue(); + + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, greaterThan("uuid", UUID_MIN_VALUE)).eval(FILE); + assertThat(shouldRead).as("Should read: uuid between lower and upper bounds").isTrue(); + + UUID justBelowMax = UUID.fromString("ffffffff-ffff-fffe-ffff-ffffffffffff"); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, greaterThan("uuid", justBelowMax)).eval(FILE); + assertThat(shouldRead).as("Should read: one possible uuid").isTrue(); + + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, greaterThan("uuid", UUID_MAX_VALUE)).eval(FILE); + assertThat(shouldRead) + .as("Should not read: uuid range above upper bound (UUID_MAX is not > UUID_MAX)") + .isFalse(); + + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, greaterThan("uuid", aboveMax)).eval(FILE); + assertThat(shouldRead).as("Should not read: uuid range above upper bound").isFalse(); + } + + @Test + public void testUuidGtEq() { + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, greaterThanOrEqual("uuid", belowMin)).eval(FILE); + assertThat(shouldRead).as("Should read: all uuids in range").isTrue(); + + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, greaterThanOrEqual("uuid", UUID_MIN_VALUE)) + .eval(FILE); + assertThat(shouldRead).as("Should read: all uuids in range").isTrue(); + + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, greaterThanOrEqual("uuid", UUID_MAX_VALUE)) + .eval(FILE); + assertThat(shouldRead).as("Should read: one possible uuid").isTrue(); + + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, greaterThanOrEqual("uuid", aboveMax)).eval(FILE); + assertThat(shouldRead).as("Should not read: uuid range above upper bound").isFalse(); + } + + @Test + public void testUuidIn() { + UUID belowMin1 = UUID.fromString("00000000-0000-0000-0000-000000000000"); + UUID belowMin2 = UUID.fromString("00000000-0000-0000-0000-000000000001"); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, in("uuid", belowMin1, belowMin2)).eval(FILE); + assertThat(shouldRead).as("Should not read: uuids below lower bound").isFalse(); + + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, in("uuid", belowMin1, UUID_MIN_VALUE)).eval(FILE); + assertThat(shouldRead).as("Should read: uuid equal to lower bound").isTrue(); + + UUID middle1 = UUID.fromString("7fffffff-ffff-ffff-0000-000000000000"); + UUID middle2 = UUID.fromString("7fffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, in("uuid", middle1, middle2)).eval(FILE); + assertThat(shouldRead).as("Should read: uuids between lower and upper bounds").isTrue(); + + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, in("uuid", UUID_MAX_VALUE, aboveMax)).eval(FILE); + assertThat(shouldRead).as("Should read: uuid equal to upper bound").isTrue(); + + UUID aboveMax2 = UUID.fromString("ffffffff-ffff-ffff-ffff-fffffffffffe"); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, in("uuid", aboveMax, aboveMax2)).eval(FILE); + assertThat(shouldRead).as("Should not read: uuids above upper bound").isFalse(); + } + + @Test + public void testUuidNotEq() { + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notEqual("uuid", belowMin)).eval(FILE); + assertThat(shouldRead).as("Should read: notEqual always reads").isTrue(); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notEqual("uuid", UUID_MIN_VALUE)).eval(FILE); + assertThat(shouldRead).as("Should read: notEqual always reads").isTrue(); + + UUID middle = UUID.fromString("7fffffff-ffff-ffff-7fff-ffffffffffff"); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notEqual("uuid", middle)).eval(FILE); + assertThat(shouldRead).as("Should read: notEqual always reads").isTrue(); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notEqual("uuid", UUID_MAX_VALUE)).eval(FILE); + assertThat(shouldRead).as("Should read: notEqual always reads").isTrue(); + + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notEqual("uuid", aboveMax)).eval(FILE); + assertThat(shouldRead).as("Should read: notEqual always reads").isTrue(); + } + + @Test + public void testUuidNotIn() { + UUID belowMin1 = UUID.fromString("00000000-0000-0000-0000-000000000000"); + UUID belowMin2 = UUID.fromString("00000000-0000-0000-0000-000000000001"); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notIn("uuid", belowMin1, belowMin2)).eval(FILE); + assertThat(shouldRead).as("Should read: notIn always reads").isTrue(); + + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notIn("uuid", UUID_MIN_VALUE, UUID_MAX_VALUE)) + .eval(FILE); + assertThat(shouldRead).as("Should read: notIn always reads").isTrue(); + + UUID middle1 = UUID.fromString("7fffffff-ffff-ffff-0000-000000000000"); + UUID middle2 = UUID.fromString("7fffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notIn("uuid", middle1, middle2)).eval(FILE); + assertThat(shouldRead).as("Should read: notIn always reads").isTrue(); + } } diff --git a/api/src/test/java/org/apache/iceberg/types/TestComparators.java b/api/src/test/java/org/apache/iceberg/types/TestComparators.java index 691e3f04a074..8a27bd84109d 100644 --- a/api/src/test/java/org/apache/iceberg/types/TestComparators.java +++ b/api/src/test/java/org/apache/iceberg/types/TestComparators.java @@ -100,6 +100,35 @@ public void testUuid() { Comparators.forType(Types.UUIDType.get()), UUID.fromString("81873e7d-1374-4493-8e1d-9095eff7046c"), UUID.fromString("fd02441d-1423-4a3f-8785-c7dd5647e26b")); + assertComparesCorrectly( + Comparators.forType(Types.UUIDType.get()), + UUID.fromString("00000000-0000-0000-0000-000000000000"), + UUID.fromString("60000000-0000-0000-0000-000000000000")); + assertComparesCorrectly( + Comparators.forType(Types.UUIDType.get()), + UUID.fromString("60000000-0000-0000-0000-000000000000"), + UUID.fromString("70000000-0000-0000-0000-000000000000")); + // The following assertion fails prior to the introduction of UUIDComparator. + assertComparesCorrectly( + Comparators.forType(Types.UUIDType.get()), + UUID.fromString("70000000-0000-0000-0000-000000000000"), + UUID.fromString("80000000-0000-0000-0000-000000000000")); + assertComparesCorrectly( + Comparators.forType(Types.UUIDType.get()), + UUID.fromString("80000000-0000-0000-0000-000000000000"), + UUID.fromString("90000000-0000-0000-0000-000000000000")); + assertComparesCorrectly( + Comparators.forType(Types.UUIDType.get()), + UUID.fromString("90000000-0000-0000-0000-000000000000"), + UUID.fromString("a0000000-0000-0000-0000-000000000000")); + assertComparesCorrectly( + Comparators.forType(Types.UUIDType.get()), + UUID.fromString("a0000000-0000-0000-0000-000000000000"), + UUID.fromString("f0000000-0000-0000-0000-000000000000")); + assertComparesCorrectly( + Comparators.forType(Types.UUIDType.get()), + UUID.fromString("ffffffff-ffff-ffff-ffff-fffffffffffe"), + UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff")); } @Test diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java index b0d98b358b6d..0c3b60827b96 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java @@ -29,6 +29,7 @@ import org.apache.iceberg.SortKey; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Comparators; public class DataDistributionUtil { private DataDistributionUtil() {} @@ -162,7 +163,7 @@ public static UUID[] reservoirSampleUUIDs(int sampleSize, int reservoirSize) { } } - Arrays.sort(reservoir); + Arrays.sort(reservoir, Comparators.uuids()); return reservoir; } @@ -172,7 +173,7 @@ public static UUID[] rangeBoundSampleUUIDs(UUID[] sampledUUIDs, int rangeBoundSi for (int i = 0; i < rangeBoundSize; ++i) { rangeBounds[i] = sampledUUIDs[i * step]; } - Arrays.sort(rangeBounds); + Arrays.sort(rangeBounds, Comparators.uuids()); return rangeBounds; } } diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java index b0d98b358b6d..0c3b60827b96 100644 --- a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java @@ -29,6 +29,7 @@ import org.apache.iceberg.SortKey; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Comparators; public class DataDistributionUtil { private DataDistributionUtil() {} @@ -162,7 +163,7 @@ public static UUID[] reservoirSampleUUIDs(int sampleSize, int reservoirSize) { } } - Arrays.sort(reservoir); + Arrays.sort(reservoir, Comparators.uuids()); return reservoir; } @@ -172,7 +173,7 @@ public static UUID[] rangeBoundSampleUUIDs(UUID[] sampledUUIDs, int rangeBoundSi for (int i = 0; i < rangeBoundSize; ++i) { rangeBounds[i] = sampledUUIDs[i * step]; } - Arrays.sort(rangeBounds); + Arrays.sort(rangeBounds, Comparators.uuids()); return rangeBounds; } } diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java index b0d98b358b6d..0c3b60827b96 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java @@ -29,6 +29,7 @@ import org.apache.iceberg.SortKey; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Comparators; public class DataDistributionUtil { private DataDistributionUtil() {} @@ -162,7 +163,7 @@ public static UUID[] reservoirSampleUUIDs(int sampleSize, int reservoirSize) { } } - Arrays.sort(reservoir); + Arrays.sort(reservoir, Comparators.uuids()); return reservoir; } @@ -172,7 +173,7 @@ public static UUID[] rangeBoundSampleUUIDs(UUID[] sampledUUIDs, int rangeBoundSi for (int i = 0; i < rangeBoundSize; ++i) { rangeBounds[i] = sampledUUIDs[i * step]; } - Arrays.sort(rangeBounds); + Arrays.sort(rangeBounds, Comparators.uuids()); return rangeBounds; } } From 15be0e333b385385bae5cd16457688f521ce7896 Mon Sep 17 00:00:00 2001 From: bodduv Date: Wed, 14 Jan 2026 10:16:28 +0100 Subject: [PATCH 02/11] Let ManifestFile and ManifestEntry to be read if filter expression passes either RFC or non-RFC compliant UUID comparator --- .../iceberg/expressions/ExpressionUtil.java | 197 ++++++++++++++++++ .../InclusiveMetricsEvaluator.java | 62 +++++- .../apache/iceberg/expressions/Literals.java | 29 ++- .../expressions/ManifestEvaluator.java | 63 +++++- .../org/apache/iceberg/types/Comparators.java | 13 ++ .../expressions/TestExpressionUtil.java | 121 +++++++++++ .../TestInclusiveManifestEvaluator.java | 170 ++++++++++++++- .../TestInclusiveMetricsEvaluator.java | 142 ++++++++++++- .../TestStrictMetricsEvaluator.java | 154 +++++++++++++- .../data/TestMetricsRowGroupFilter.java | 144 +++++++++++++ .../parquet/ParquetMetricsRowGroupFilter.java | 68 +++++- 11 files changed, 1120 insertions(+), 43 deletions(-) diff --git a/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java b/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java index 9bb2b713439d..1f0ab15f4fcd 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java +++ b/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java @@ -24,6 +24,7 @@ import java.time.temporal.ChronoUnit; import java.util.List; import java.util.Locale; +import java.util.UUID; import java.util.concurrent.TimeUnit; import java.util.function.Function; import java.util.regex.Pattern; @@ -744,4 +745,200 @@ private static PartitionSpec identitySpec(Schema schema, int... ids) { return specBuilder.build(); } + + /** + * Checks if the given bound expression contains any UUID predicates that compare against min/max + * bounds. These predicates may produce incorrect results when evaluating against statistics + * written with a different UUID comparator. + * + * @param expr a bound expression + * @return true if the expression contains UUID predicates that compare against bounds + */ + public static boolean hasUUIDBoundsPredicate(Expression expr) { + return ExpressionVisitors.visit(expr, new UUIDBoundsPredicateDetector()); + } + + /** + * Transforms an unbound expression to use the signed UUID comparator in all UUID literals. This + * is used for backward compatibility with files written before RFC 4122/9562 compliant comparison + * was implemented. + * + * @param expr an unbound expression + * @return a new expression with UUID literals using the signed comparator + */ + public static Expression withSignedUUIDComparator(Expression expr) { + return ExpressionVisitors.visit(expr, new SignedUUIDLiteralTransformer()); + } + + /** + * Visitor that detects if an expression contains UUID predicates that compare against bounds. + * These include: lt, ltEq, gt, gtEq, eq, notEq, in, notIn on UUID columns. + */ + private static class UUIDBoundsPredicateDetector + extends ExpressionVisitors.ExpressionVisitor { + + @Override + public Boolean alwaysTrue() { + return false; + } + + @Override + public Boolean alwaysFalse() { + return false; + } + + @Override + public Boolean not(Boolean result) { + return result; + } + + @Override + public Boolean and(Boolean leftResult, Boolean rightResult) { + return leftResult || rightResult; + } + + @Override + public Boolean or(Boolean leftResult, Boolean rightResult) { + return leftResult || rightResult; + } + + @Override + public Boolean predicate(BoundPredicate pred) { + if (pred.term() instanceof BoundReference) { + BoundReference ref = (BoundReference) pred.term(); + if (ref.type().typeId() == Type.TypeID.UUID) { + switch (pred.op()) { + case LT: + case LT_EQ: + case GT: + case GT_EQ: + case EQ: + case NOT_EQ: + case IN: + case NOT_IN: + return true; + default: + return false; + } + } + } + return false; + } + + @Override + public Boolean predicate(UnboundPredicate pred) { + // For unbound predicates, we can check if the literal is a UUID + if (pred.literal() != null && pred.literal().value() instanceof UUID) { + switch (pred.op()) { + case LT: + case LT_EQ: + case GT: + case GT_EQ: + case EQ: + case NOT_EQ: + case IN: + case NOT_IN: + return true; + default: + return false; + } + } + // Check for IN/NOT_IN with UUID literals + if (pred.literals() != null && !pred.literals().isEmpty()) { + Literal first = pred.literals().get(0); + if (first.value() instanceof UUID) { + return true; + } + } + return false; + } + } + + /** + * Visitor that transforms an expression to use the signed UUID comparator in all UUID literals. + */ + private static class SignedUUIDLiteralTransformer + extends ExpressionVisitors.ExpressionVisitor { + + @Override + public Expression alwaysTrue() { + return Expressions.alwaysTrue(); + } + + @Override + public Expression alwaysFalse() { + return Expressions.alwaysFalse(); + } + + @Override + public Expression not(Expression result) { + return Expressions.not(result); + } + + @Override + public Expression and(Expression leftResult, Expression rightResult) { + return Expressions.and(leftResult, rightResult); + } + + @Override + public Expression or(Expression leftResult, Expression rightResult) { + return Expressions.or(leftResult, rightResult); + } + + @Override + public Expression predicate(BoundPredicate pred) { + // Bound predicates should not be transformed - this is for unbound expressions + throw new UnsupportedOperationException( + "Cannot transform bound predicate; use unbound expressions"); + } + + @Override + @SuppressWarnings("unchecked") + public Expression predicate(UnboundPredicate pred) { + UnboundTerm term = pred.term(); + + switch (pred.op()) { + case IS_NULL: + case NOT_NULL: + case IS_NAN: + case NOT_NAN: + // Unary predicates don't have literals to transform + return pred; + + case LT: + case LT_EQ: + case GT: + case GT_EQ: + case EQ: + case NOT_EQ: + case STARTS_WITH: + case NOT_STARTS_WITH: + Literal lit = pred.literal(); + if (lit.value() instanceof UUID) { + Literals.UUIDLiteral uuidLit = (Literals.UUIDLiteral) lit; + return new UnboundPredicate<>(pred.op(), term, (T) uuidLit.withSignedComparator()); + } + return pred; + + case IN: + case NOT_IN: + List> literals = pred.literals(); + if (!literals.isEmpty() && literals.get(0).value() instanceof UUID) { + List transformedValues = + literals.stream() + .map( + l -> { + Literals.UUIDLiteral uuidLit = (Literals.UUIDLiteral) l; + return (T) uuidLit.withSignedComparator(); + }) + .collect(Collectors.toList()); + return new UnboundPredicate<>(pred.op(), term, transformedValues); + } + return pred; + + default: + return pred; + } + } + } } diff --git a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java index 81cbbe785519..6b6cc0b96560 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java @@ -32,6 +32,7 @@ import org.apache.iceberg.transforms.Transform; import org.apache.iceberg.types.Comparators; import org.apache.iceberg.types.Conversions; +import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types.StructType; import org.apache.iceberg.util.NaNUtil; import org.apache.iceberg.variants.Variant; @@ -56,6 +57,9 @@ public class InclusiveMetricsEvaluator { private static final int IN_PREDICATE_LIMIT = 200; private final Expression expr; + // Expression using signed UUID comparator for backward compatibility with files written before + // RFC-compliant UUID comparisons were introduced. Null if no UUID predicates. + private final Expression signedUuidExpr; public InclusiveMetricsEvaluator(Schema schema, Expression unbound) { this(schema, unbound, true); @@ -63,7 +67,17 @@ public InclusiveMetricsEvaluator(Schema schema, Expression unbound) { public InclusiveMetricsEvaluator(Schema schema, Expression unbound, boolean caseSensitive) { StructType struct = schema.asStruct(); - this.expr = Binder.bind(struct, rewriteNot(unbound), caseSensitive); + Expression rewritten = rewriteNot(unbound); + this.expr = Binder.bind(struct, rewritten, caseSensitive); + + // Only create the signed UUID expression if there are UUID predicates that compare against + // bounds + if (ExpressionUtil.hasUUIDBoundsPredicate(this.expr)) { + Expression signedRewritten = ExpressionUtil.withSignedUUIDComparator(rewritten); + this.signedUuidExpr = Binder.bind(struct, signedRewritten, caseSensitive); + } else { + this.signedUuidExpr = null; + } } /** @@ -74,7 +88,20 @@ public InclusiveMetricsEvaluator(Schema schema, Expression unbound, boolean case */ public boolean eval(ContentFile file) { // TODO: detect the case where a column is missing from the file using file's max field id. - return new MetricsEvalVisitor().eval(file); + boolean result = new MetricsEvalVisitor().eval(file, expr, false); + + // If the RFC-compliant evaluation says rows might match, or there's no signed UUID expression, + // return the result. + if (result || signedUuidExpr == null) { + return result; + } + + // Always try with signed UUID comparator as a fallback. There is no reliable way to detect + // which comparator was used when the file's column metrics were written. + // The signedUuidExpr has literals with signed comparators for lt/gt/eq predicates. + // For IN predicates, we pass signedUuidMode=true so comparatorForIn() returns signed + // comparator. + return new MetricsEvalVisitor().eval(file, signedUuidExpr, true); } private static final boolean ROWS_MIGHT_MATCH = true; @@ -86,8 +113,12 @@ private class MetricsEvalVisitor extends ExpressionVisitors.BoundVisitor nanCounts = null; private Map lowerBounds = null; private Map upperBounds = null; + // Flag to use signed UUID comparator for backward compatibility. + // This is needed for the IN predicate because the comparator information is lost + // when binding converts literals to a Set of raw values. + private boolean useSignedUuidComparator = false; - private boolean eval(ContentFile file) { + private boolean eval(ContentFile file, Expression expression, boolean signedUuidMode) { if (file.recordCount() == 0) { return ROWS_CANNOT_MATCH; } @@ -104,8 +135,22 @@ private boolean eval(ContentFile file) { this.nanCounts = file.nanValueCounts(); this.lowerBounds = file.lowerBounds(); this.upperBounds = file.upperBounds(); + this.useSignedUuidComparator = signedUuidMode; - return ExpressionVisitors.visitEvaluator(expr, this); + return ExpressionVisitors.visitEvaluator(expression, this); + } + + /** + * Returns the appropriate comparator for the given term. This is needed for the IN predicate + * because the comparator information is lost when binding converts literals to a Set of raw + * values. For other predicates, the literal carries the comparator. + */ + @SuppressWarnings("unchecked") + private Comparator comparatorForIn(Bound term) { + if (useSignedUuidComparator && term.ref().type().typeId() == Type.TypeID.UUID) { + return (Comparator) Comparators.signedUUIDs(); + } + return ((BoundTerm) term).comparator(); } @Override @@ -359,10 +404,9 @@ public Boolean in(Bound term, Set literalSet) { return ROWS_MIGHT_MATCH; } + Comparator cmp = comparatorForIn(term); literals = - literals.stream() - .filter(v -> ((BoundTerm) term).comparator().compare(lower, v) <= 0) - .collect(Collectors.toList()); + literals.stream().filter(v -> cmp.compare(lower, v) <= 0).collect(Collectors.toList()); // if all values are less than lower bound, rows cannot match if (literals.isEmpty()) { return ROWS_CANNOT_MATCH; @@ -374,9 +418,7 @@ public Boolean in(Bound term, Set literalSet) { } literals = - literals.stream() - .filter(v -> ((BoundTerm) term).comparator().compare(upper, v) >= 0) - .collect(Collectors.toList()); + literals.stream().filter(v -> cmp.compare(upper, v) >= 0).collect(Collectors.toList()); // if remaining values are greater than upper bound, rows cannot match if (literals.isEmpty()) { return ROWS_CANNOT_MATCH; diff --git a/api/src/main/java/org/apache/iceberg/expressions/Literals.java b/api/src/main/java/org/apache/iceberg/expressions/Literals.java index b5a1cbd2f917..40f664bfc0f4 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/Literals.java +++ b/api/src/main/java/org/apache/iceberg/expressions/Literals.java @@ -608,12 +608,24 @@ public String toString() { } } - static class UUIDLiteral extends ComparableLiteral { - private static final Comparator CMP = + static class UUIDLiteral extends BaseLiteral { + private static final Comparator RFC_CMP = Comparators.nullsFirst().thenComparing(Comparators.uuids()); + private static final Comparator SIGNED_CMP = + Comparators.nullsFirst().thenComparing(Comparators.signedUUIDs()); + + // Flag to indicate which comparator to use (serializable) + private final boolean useSignedComparator; + // Transient cached comparator (reconstructed on deserialization) + private transient volatile Comparator cmp; UUIDLiteral(UUID value) { + this(value, false); + } + + UUIDLiteral(UUID value, boolean useSignedComparator) { super(value); + this.useSignedComparator = useSignedComparator; } @Override @@ -627,13 +639,24 @@ public Literal to(Type type) { @Override public Comparator comparator() { - return CMP; + if (cmp == null) { + cmp = useSignedComparator ? SIGNED_CMP : RFC_CMP; + } + return cmp; } @Override protected Type.TypeID typeId() { return Type.TypeID.UUID; } + + /** + * Creates a new UUIDLiteral with the signed comparator for backward compatibility with files + * written before RFC-compliant UUID comparisons were introduced. + */ + UUIDLiteral withSignedComparator() { + return new UUIDLiteral(value(), true); + } } static class FixedLiteral extends BaseLiteral { diff --git a/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java index 21762946ca33..867d765a71fa 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java @@ -34,6 +34,7 @@ import org.apache.iceberg.types.Comparators; import org.apache.iceberg.types.Conversions; import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; import org.apache.iceberg.util.BinaryUtil; /** @@ -51,6 +52,9 @@ public class ManifestEvaluator { private static final int IN_PREDICATE_LIMIT = 200; private final Expression expr; + // Expression using signed UUID comparator for backward compatibility with files written before + // RFC-compliant UUID comparisons were introduced. Null if no UUID predicates. + private final Expression signedUuidExpr; public static ManifestEvaluator forRowFilter( Expression rowFilter, PartitionSpec spec, boolean caseSensitive) { @@ -64,7 +68,18 @@ public static ManifestEvaluator forPartitionFilter( } private ManifestEvaluator(PartitionSpec spec, Expression partitionFilter, boolean caseSensitive) { - this.expr = Binder.bind(spec.partitionType(), rewriteNot(partitionFilter), caseSensitive); + Types.StructType partitionType = spec.partitionType(); + Expression rewritten = rewriteNot(partitionFilter); + this.expr = Binder.bind(partitionType, rewritten, caseSensitive); + + // Only create the signed UUID expression if there are UUID predicates that compare against + // bounds + if (ExpressionUtil.hasUUIDBoundsPredicate(this.expr)) { + Expression signedRewritten = ExpressionUtil.withSignedUUIDComparator(rewritten); + this.signedUuidExpr = Binder.bind(partitionType, signedRewritten, caseSensitive); + } else { + this.signedUuidExpr = null; + } } /** @@ -74,7 +89,20 @@ private ManifestEvaluator(PartitionSpec spec, Expression partitionFilter, boolea * @return false if the file cannot contain rows that match the expression, true otherwise. */ public boolean eval(ManifestFile manifest) { - return new ManifestEvalVisitor().eval(manifest); + boolean result = new ManifestEvalVisitor().eval(manifest, expr, false); + + // If the RFC-compliant evaluation says rows might match, or there's no signed UUID expression, + // return the result. + if (result || signedUuidExpr == null) { + return result; + } + + // Always try with signed UUID comparator as a fallback. There is no reliable way to detect + // which comparator was used when the manifest's partition field summaries were written. + // The signedUuidExpr has literals with signed comparators for lt/gt/eq predicates. + // For IN predicates, we pass signedUuidMode=true so comparatorForIn() returns signed + // comparator. + return new ManifestEvalVisitor().eval(manifest, signedUuidExpr, true); } private static final boolean ROWS_MIGHT_MATCH = true; @@ -82,14 +110,32 @@ public boolean eval(ManifestFile manifest) { private class ManifestEvalVisitor extends BoundExpressionVisitor { private List stats = null; + // Flag to use signed UUID comparator for backward compatibility. + // This is needed for the IN predicate because the comparator information is lost + // when binding converts literals to a Set of raw values. + private boolean useSignedUuidComparator = false; - private boolean eval(ManifestFile manifest) { + private boolean eval(ManifestFile manifest, Expression expression, boolean signedUuidMode) { this.stats = manifest.partitions(); + this.useSignedUuidComparator = signedUuidMode; if (stats == null) { return ROWS_MIGHT_MATCH; } - return ExpressionVisitors.visitEvaluator(expr, this); + return ExpressionVisitors.visitEvaluator(expression, this); + } + + /** + * Returns the appropriate comparator for the given reference. This is needed for the IN + * predicate because the comparator information is lost when binding converts literals to a Set + * of raw values. For other predicates, the literal carries the comparator. + */ + @SuppressWarnings("unchecked") + private Comparator comparatorForIn(BoundReference ref) { + if (useSignedUuidComparator && ref.type().typeId() == Type.TypeID.UUID) { + return (Comparator) Comparators.signedUUIDs(); + } + return ref.comparator(); } @Override @@ -295,20 +341,17 @@ public Boolean in(BoundReference ref, Set literalSet) { return ROWS_MIGHT_MATCH; } + Comparator cmp = comparatorForIn(ref); T lower = Conversions.fromByteBuffer(ref.type(), fieldStats.lowerBound()); literals = - literals.stream() - .filter(v -> ref.comparator().compare(lower, v) <= 0) - .collect(Collectors.toList()); + literals.stream().filter(v -> cmp.compare(lower, v) <= 0).collect(Collectors.toList()); if (literals.isEmpty()) { // if all values are less than lower bound, rows cannot match. return ROWS_CANNOT_MATCH; } T upper = Conversions.fromByteBuffer(ref.type(), fieldStats.upperBound()); literals = - literals.stream() - .filter(v -> ref.comparator().compare(upper, v) >= 0) - .collect(Collectors.toList()); + literals.stream().filter(v -> cmp.compare(upper, v) >= 0).collect(Collectors.toList()); if (literals .isEmpty()) { // if all remaining values are greater than upper bound, rows cannot match. return ROWS_CANNOT_MATCH; diff --git a/api/src/main/java/org/apache/iceberg/types/Comparators.java b/api/src/main/java/org/apache/iceberg/types/Comparators.java index d99b66961fcf..79aa36165bd3 100644 --- a/api/src/main/java/org/apache/iceberg/types/Comparators.java +++ b/api/src/main/java/org/apache/iceberg/types/Comparators.java @@ -238,6 +238,19 @@ public static Comparator uuids() { return UUIDComparator.INSTANCE; } + /** + * Returns a comparator that uses Java's native signed UUID comparison. + * + *

This is provided for backward compatibility with older files that may have been written with + * min/max statistics computed using Java's signed {@link UUID#compareTo(UUID)}. New code should + * use {@link #uuids()} which provides RFC 4122/9562 compliant unsigned comparison. + * + * @return a comparator using signed UUID comparison + */ + public static Comparator signedUUIDs() { + return Comparator.naturalOrder(); + } + private static class NullsFirst implements Comparator { private static final NullsFirst INSTANCE = new NullsFirst<>(); diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java index d9fe26eacc6b..b3a195041d8e 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java @@ -29,6 +29,7 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.UUID; import java.util.regex.Pattern; import java.util.stream.IntStream; import org.apache.iceberg.PartitionSpec; @@ -1350,4 +1351,124 @@ private void assertEquals(UnboundTransform expected, UnboundTransform pred = (UnboundPredicate) transformed; + assertThat(pred.literal().value()).isEqualTo(testUuid); + // The literal should now use the signed comparator + assertThat(pred.literal()).isInstanceOf(Literals.UUIDLiteral.class); + } + + @Test + public void testWithSignedUUIDComparatorTransformsInPredicate() { + UUID uuid1 = UUID.randomUUID(); + UUID uuid2 = UUID.randomUUID(); + Expression original = Expressions.in("uuid_col", uuid1, uuid2); + Expression transformed = ExpressionUtil.withSignedUUIDComparator(original); + + assertThat(transformed).isInstanceOf(UnboundPredicate.class); + UnboundPredicate pred = (UnboundPredicate) transformed; + assertThat(pred.literals()).hasSize(2); + } + + @Test + public void testWithSignedUUIDComparatorPreservesNonUuidLiterals() { + Expression original = Expressions.equal("id", 42L); + Expression transformed = ExpressionUtil.withSignedUUIDComparator(original); + + // Should be unchanged + assertThat(transformed).isInstanceOf(UnboundPredicate.class); + UnboundPredicate pred = (UnboundPredicate) transformed; + assertThat(pred.literal().value()).isEqualTo(42L); + } + + @Test + public void testWithSignedUUIDComparatorTransformsCompoundExpression() { + UUID testUuid = UUID.randomUUID(); + Expression original = + Expressions.and( + Expressions.equal("id", 42L), Expressions.greaterThan("uuid_col", testUuid)); + Expression transformed = ExpressionUtil.withSignedUUIDComparator(original); + + assertThat(transformed).isInstanceOf(And.class); + And and = (And) transformed; + // Both children should be transformed + assertThat(and.left()).isInstanceOf(UnboundPredicate.class); + assertThat(and.right()).isInstanceOf(UnboundPredicate.class); + } } diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java index 1673bd7c2d75..3be7f60d4606 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java @@ -896,11 +896,18 @@ public void testUuidEq() { @Test public void testUuidLt() { + // With RFC comparison, belowMin is below the lower bound so no rows can be < belowMin. + // With signed comparison, UUID_MIN_VALUE.compareTo(belowMin) = -1 (lower < lit), + // so rows might match. We try both comparators and return true if either matches. UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); boolean shouldRead = ManifestEvaluator.forRowFilter(lessThan("uuid", belowMin), SPEC, true).eval(FILE); - assertThat(shouldRead).as("Should not read: uuid range below lower bound").isFalse(); + assertThat(shouldRead) + .as("Should read: signed UUID fallback finds matches with inverted bounds") + .isTrue(); + // UUID_MIN_VALUE is the lower bound, so no rows can be < UUID_MIN_VALUE. + // Both RFC and signed comparators agree on this since we're comparing the value to itself. shouldRead = ManifestEvaluator.forRowFilter(lessThan("uuid", UUID_MIN_VALUE), SPEC, true).eval(FILE); assertThat(shouldRead) @@ -923,10 +930,14 @@ public void testUuidLt() { @Test public void testUuidLtEq() { + // With RFC comparison, belowMin is below the lower bound so no rows can be <= belowMin. + // With signed comparison, the bounds are inverted, so rows might match. UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); boolean shouldRead = ManifestEvaluator.forRowFilter(lessThanOrEqual("uuid", belowMin), SPEC, true).eval(FILE); - assertThat(shouldRead).as("Should not read: uuid range below lower bound").isFalse(); + assertThat(shouldRead) + .as("Should read: signed UUID fallback finds matches with inverted bounds") + .isTrue(); shouldRead = ManifestEvaluator.forRowFilter(lessThanOrEqual("uuid", UUID_MIN_VALUE), SPEC, true) @@ -960,16 +971,23 @@ public void testUuidGt() { ManifestEvaluator.forRowFilter(greaterThan("uuid", justBelowMax), SPEC, true).eval(FILE); assertThat(shouldRead).as("Should read: one possible uuid").isTrue(); + // UUID_MAX_VALUE is the upper bound, so no rows can be > UUID_MAX_VALUE. + // Both RFC and signed comparators agree on this since we're comparing the value to itself. shouldRead = ManifestEvaluator.forRowFilter(greaterThan("uuid", UUID_MAX_VALUE), SPEC, true).eval(FILE); assertThat(shouldRead) .as("Should not read: uuid range above upper bound (UUID_MAX is not > UUID_MAX)") .isFalse(); + // With RFC comparison, aboveMax is above the upper bound so no rows can be > aboveMax. + // With signed comparison, UUID_MAX_VALUE.compareTo(aboveMax) = 1 (upper > lit), + // so rows might match. We try both comparators and return true if either matches. UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); shouldRead = ManifestEvaluator.forRowFilter(greaterThan("uuid", aboveMax), SPEC, true).eval(FILE); - assertThat(shouldRead).as("Should not read: uuid range above upper bound").isFalse(); + assertThat(shouldRead) + .as("Should read: signed UUID fallback finds matches with inverted bounds") + .isTrue(); } @Test @@ -989,10 +1007,15 @@ public void testUuidGtEq() { .eval(FILE); assertThat(shouldRead).as("Should read: one possible uuid").isTrue(); + // With RFC comparison, aboveMax is above the upper bound so no rows can be >= aboveMax. + // With signed comparison, UUID_MAX_VALUE.compareTo(aboveMax) = 1 (upper > lit), + // so rows might match. We try both comparators and return true if either matches. UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); shouldRead = ManifestEvaluator.forRowFilter(greaterThanOrEqual("uuid", aboveMax), SPEC, true).eval(FILE); - assertThat(shouldRead).as("Should not read: uuid range above upper bound").isFalse(); + assertThat(shouldRead) + .as("Should read: signed UUID fallback finds matches with inverted bounds") + .isTrue(); } @Test @@ -1024,4 +1047,143 @@ public void testUuidIn() { ManifestEvaluator.forRowFilter(in("uuid", aboveMax, aboveMax2), SPEC, true).eval(FILE); assertThat(shouldRead).as("Should not read: uuids above upper bound").isFalse(); } + + // Tests for legacy UUID file compatibility (files written with signed UUID comparator) + // These tests simulate manifests where min/max bounds were computed using Java's signed + // comparison. + // + // Key insight: Java's UUID.compareTo() compares MSB first, then LSB, both as SIGNED longs. + // This means: + // - UUIDs starting with 0x00-0x7F are "positive" (MSB is positive as signed long) + // - UUIDs starting with 0x80-0xFF are "negative" (MSB is negative as signed long) + // + // Example file containing UUIDs: 0x00..., 0x40..., 0x80... + // - Unsigned (RFC) order: 0x00... < 0x40... < 0x80... + // - Signed (Java) order: 0x80... < 0x00... < 0x40... + // + // If written with signed comparator, the manifest would have: + // - min = 0x80... (smallest in signed order) + // - max = 0x40... (largest in signed order) + + // Legacy manifest with "inverted" bounds (min > max in unsigned order) + private static final UUID LEGACY_UUID_MIN = + UUID.fromString("80000000-0000-0000-0000-000000000001"); + private static final UUID LEGACY_UUID_MAX = + UUID.fromString("40000000-0000-0000-0000-000000000001"); + + private static final ManifestFile LEGACY_UUID_MANIFEST = + new TestHelpers.TestManifestFile( + "legacy-uuid-manifest.avro", + 1024, + 0, + System.currentTimeMillis(), + 5, + 10, + 0, + ImmutableList.of( + new TestHelpers.TestFieldSummary(false, INT_MIN, INT_MAX), + new TestHelpers.TestFieldSummary(true, null, null), + new TestHelpers.TestFieldSummary(true, STRING_MIN, STRING_MAX), + new TestHelpers.TestFieldSummary(false, STRING_MIN, STRING_MAX), + new TestHelpers.TestFieldSummary( + false, + toByteBuffer(Types.FloatType.get(), 0F), + toByteBuffer(Types.FloatType.get(), 20F)), + new TestHelpers.TestFieldSummary(true, null, null), + new TestHelpers.TestFieldSummary(true, false, null, null), + new TestHelpers.TestFieldSummary(false, true, null, null), + new TestHelpers.TestFieldSummary(true, true, null, null), + new TestHelpers.TestFieldSummary( + false, + false, + toByteBuffer(Types.FloatType.get(), 0F), + toByteBuffer(Types.FloatType.get(), 20F)), + new TestHelpers.TestFieldSummary(true, null, null), + new TestHelpers.TestFieldSummary(true, STRING_MIN, STRING_MIN), + new TestHelpers.TestFieldSummary(false, STRING_MIN, STRING_MIN), + new TestHelpers.TestFieldSummary( + false, + toByteBuffer(Types.UUIDType.get(), LEGACY_UUID_MIN), + toByteBuffer(Types.UUIDType.get(), LEGACY_UUID_MAX))), + null); + + @Test + public void testLegacyUuidManifestIn() { + // Query: uuid IN (0x20..., 0x30...) + // Both are between 0x80... and 0x40... in signed order + // RFC evaluation: Both values are < 0x80... (lower bound) in unsigned, so filtered out + // Legacy evaluation: Uses signed comparator, so 0x80... < 0x20... < 0x30... < 0x40... + UUID uuid1 = UUID.fromString("20000000-0000-0000-0000-000000000001"); + UUID uuid2 = UUID.fromString("30000000-0000-0000-0000-000000000001"); + boolean shouldRead = + ManifestEvaluator.forRowFilter(in("uuid", uuid1, uuid2), SPEC, true) + .eval(LEGACY_UUID_MANIFEST); + assertThat(shouldRead) + .as("Should read: legacy fallback should find UUIDs in signed-ordered bounds") + .isTrue(); + } + + @Test + public void testLegacyUuidManifestInAllBelowLowerBound() { + // Query: uuid IN (0x50..., 0x60...) + // Both are > 0x40... (upper bound) in both unsigned and signed order + // RFC evaluation: Both values are < 0x80... (lower bound) in unsigned, so filtered out + // Legacy evaluation: Both values are > 0x40... (upper bound) in signed order, so filtered out + // So both comparators should filter these out + UUID uuid1 = UUID.fromString("50000000-0000-0000-0000-000000000001"); + UUID uuid2 = UUID.fromString("60000000-0000-0000-0000-000000000001"); + boolean shouldRead = + ManifestEvaluator.forRowFilter(in("uuid", uuid1, uuid2), SPEC, true) + .eval(LEGACY_UUID_MANIFEST); + // In signed order: 0x80... < 0x90... < ... < 0xFF... < 0x00... < ... < 0x40... < 0x50... < + // 0x60... + // So 0x50... and 0x60... are above upper bound [0x40...] in signed order + // In RFC order: 0x50... and 0x60... are below lower bound [0x80...] + assertThat(shouldRead) + .as("Should not read: UUIDs outside bounds in both RFC and signed order") + .isFalse(); + } + + @Test + public void testLegacyUuidManifestEq() { + // Query for 0x20... which is between 0x80... and 0x40... in signed order + UUID queryUuid = UUID.fromString("20000000-0000-0000-0000-000000000001"); + + // RFC evaluation would fail: 0x20... < 0x80... in unsigned, so outside [0x80..., 0x40...] + // Legacy evaluation should succeed: 0x80... < 0x20... < 0x40... in signed + boolean shouldRead = + ManifestEvaluator.forRowFilter(equal("uuid", queryUuid), SPEC, true) + .eval(LEGACY_UUID_MANIFEST); + assertThat(shouldRead) + .as("Should read: legacy fallback should find UUID in signed-ordered bounds") + .isTrue(); + } + + @Test + public void testLegacyUuidManifestLt() { + // Query: uuid < 0x30... + // In signed order, values less than 0x30... include 0x80..., 0x00..., 0x20... + // The manifest's min is 0x80... which is < 0x30... in signed order + UUID queryUuid = UUID.fromString("30000000-0000-0000-0000-000000000001"); + boolean shouldRead = + ManifestEvaluator.forRowFilter(lessThan("uuid", queryUuid), SPEC, true) + .eval(LEGACY_UUID_MANIFEST); + assertThat(shouldRead) + .as("Should read: legacy fallback should find UUIDs less than query in signed order") + .isTrue(); + } + + @Test + public void testLegacyUuidManifestGt() { + // Query: uuid > 0x20... + // In signed order, values greater than 0x20... include 0x30..., 0x40... + // The manifest's max is 0x40... which is > 0x20... in signed order + UUID queryUuid = UUID.fromString("20000000-0000-0000-0000-000000000001"); + boolean shouldRead = + ManifestEvaluator.forRowFilter(greaterThan("uuid", queryUuid), SPEC, true) + .eval(LEGACY_UUID_MANIFEST); + assertThat(shouldRead) + .as("Should read: legacy fallback should find UUIDs greater than query in signed order") + .isTrue(); + } } diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java index a723746dadda..1ec7c54898bd 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java @@ -1178,11 +1178,18 @@ public void testUuidEq() { @Test public void testUuidLt() { + // With RFC comparison, belowMin is below the lower bound so no rows can be < belowMin. + // With signed comparison, UUID_MIN_VALUE.compareTo(belowMin) = -1 (lower < lit), + // so rows might match. We try both comparators and return true if either matches. UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThan("uuid", belowMin)).eval(FILE); - assertThat(shouldRead).as("Should not read: uuid range below lower bound").isFalse(); + assertThat(shouldRead) + .as("Should read: signed UUID fallback finds matches with inverted bounds") + .isTrue(); + // UUID_MIN_VALUE is the lower bound, so no rows can be < UUID_MIN_VALUE. + // Both RFC and signed comparators agree on this since we're comparing the value to itself. shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThan("uuid", UUID_MIN_VALUE)).eval(FILE); assertThat(shouldRead) .as("Should not read: uuid range below lower bound (UUID_MIN is not < UUID_MIN)") @@ -1202,10 +1209,15 @@ public void testUuidLt() { @Test public void testUuidLtEq() { + // With RFC comparison, belowMin is below the lower bound so no rows can be <= belowMin. + // However, we also try signed comparison for backward compatibility, and with signed comparison + // the bounds are inverted, so rows might match. UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThanOrEqual("uuid", belowMin)).eval(FILE); - assertThat(shouldRead).as("Should not read: uuid range below lower bound").isFalse(); + assertThat(shouldRead) + .as("Should read: signed UUID fallback may find matches with inverted bounds") + .isTrue(); shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThanOrEqual("uuid", UUID_MIN_VALUE)).eval(FILE); @@ -1237,15 +1249,22 @@ public void testUuidGt() { new InclusiveMetricsEvaluator(SCHEMA, greaterThan("uuid", justBelowMax)).eval(FILE); assertThat(shouldRead).as("Should read: one possible uuid").isTrue(); + // UUID_MAX_VALUE is the upper bound, so no rows can be > UUID_MAX_VALUE. + // Both RFC and signed comparators agree on this since we're comparing the value to itself. shouldRead = new InclusiveMetricsEvaluator(SCHEMA, greaterThan("uuid", UUID_MAX_VALUE)).eval(FILE); assertThat(shouldRead) .as("Should not read: uuid range above upper bound (UUID_MAX is not > UUID_MAX)") .isFalse(); + // With RFC comparison, aboveMax is above the upper bound so no rows can be > aboveMax. + // With signed comparison, UUID_MAX_VALUE.compareTo(aboveMax) = 1 (upper > lit), + // so rows might match. We try both comparators and return true if either matches. UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); shouldRead = new InclusiveMetricsEvaluator(SCHEMA, greaterThan("uuid", aboveMax)).eval(FILE); - assertThat(shouldRead).as("Should not read: uuid range above upper bound").isFalse(); + assertThat(shouldRead) + .as("Should read: signed UUID fallback finds matches with inverted bounds") + .isTrue(); } @Test @@ -1265,10 +1284,15 @@ public void testUuidGtEq() { .eval(FILE); assertThat(shouldRead).as("Should read: one possible uuid").isTrue(); + // With RFC comparison, aboveMax is above the upper bound so no rows can be >= aboveMax. + // With signed comparison, UUID_MAX_VALUE.compareTo(aboveMax) = 1 (upper > lit), + // so rows might match. We try both comparators and return true if either matches. UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); shouldRead = new InclusiveMetricsEvaluator(SCHEMA, greaterThanOrEqual("uuid", aboveMax)).eval(FILE); - assertThat(shouldRead).as("Should not read: uuid range above upper bound").isFalse(); + assertThat(shouldRead) + .as("Should read: signed UUID fallback finds matches with inverted bounds") + .isTrue(); } @Test @@ -1338,4 +1362,114 @@ public void testUuidNotIn() { shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notIn("uuid", middle1, middle2)).eval(FILE); assertThat(shouldRead).as("Should read: notIn always reads").isTrue(); } + + // Tests for legacy UUID file compatibility (files written with signed UUID comparator) + // These tests simulate files where min/max bounds were computed using Java's signed comparison. + // + // Key insight: Java's UUID.compareTo() compares MSB first, then LSB, both as SIGNED longs. + // This means: + // - UUIDs starting with 0x00-0x7F are "positive" (MSB is positive as signed long) + // - UUIDs starting with 0x80-0xFF are "negative" (MSB is negative as signed long) + // + // Example file containing UUIDs: 0x00..., 0x40..., 0x80... + // - Unsigned (RFC) order: 0x00... < 0x40... < 0x80... + // - Signed (Java) order: 0x80... < 0x00... < 0x40... + // + // If written with signed comparator, the file would have: + // - min = 0x80... (smallest in signed order) + // - max = 0x40... (largest in signed order) + // + // Query for 0x20... (which is in the file): + // - RFC evaluation: Is 0x20... in [0x80..., 0x40...]? + // In unsigned: 0x20... < 0x80... (lower bound), so NO - incorrectly skipped! + // - Legacy evaluation: Is 0x20... in [0x80..., 0x40...]? + // In signed: 0x80... < 0x20... < 0x40..., so YES - correctly included! + + // File contains UUIDs from 0x80... to 0x40... (in signed order) + // This represents actual values: 0x80..., 0x00..., 0x20..., 0x40... + private static final UUID LEGACY_SIGNED_MIN = + UUID.fromString("80000000-0000-0000-0000-000000000001"); // Smallest in signed order + private static final UUID LEGACY_SIGNED_MAX = + UUID.fromString("40000000-0000-0000-0000-000000000001"); // Largest in signed order + + private static final DataFile LEGACY_UUID_FILE = + new TestDataFile( + "legacy_uuid_file.avro", + Row.of(), + 50, + ImmutableMap.of(15, 50L), + ImmutableMap.of(15, 0L), + null, + // lower bound: 0x80... (smallest in signed comparison) + ImmutableMap.of(15, toByteBuffer(Types.UUIDType.get(), LEGACY_SIGNED_MIN)), + // upper bound: 0x40... (largest in signed comparison) + ImmutableMap.of(15, toByteBuffer(Types.UUIDType.get(), LEGACY_SIGNED_MAX))); + + @Test + public void testLegacyUuidFileEq() { + // Query for 0x20... which is between 0x80... and 0x40... in signed order + // (i.e., it's in the file's actual range) + UUID queryUuid = UUID.fromString("20000000-0000-0000-0000-000000000001"); + + // RFC evaluation would fail: 0x20... < 0x80... in unsigned, so outside [0x80..., 0x40...] + // Legacy evaluation should succeed: 0x80... < 0x20... < 0x40... in signed + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, equal("uuid", queryUuid)).eval(LEGACY_UUID_FILE); + assertThat(shouldRead) + .as("Should read: legacy fallback should find UUID in signed-ordered bounds") + .isTrue(); + } + + @Test + public void testLegacyUuidFileLt() { + // Query: uuid < 0x30... + // In signed order, values less than 0x30... include 0x80..., 0x00..., 0x20... + // The file contains 0x80... which is < 0x30... in signed order + UUID queryUuid = UUID.fromString("30000000-0000-0000-0000-000000000001"); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, lessThan("uuid", queryUuid)).eval(LEGACY_UUID_FILE); + assertThat(shouldRead) + .as("Should read: legacy fallback should find UUIDs less than query in signed order") + .isTrue(); + } + + @Test + public void testLegacyUuidFileGt() { + // Query: uuid > 0x20... + // In signed order, values greater than 0x20... include 0x30..., 0x40... + // The file's max is 0x40... which is > 0x20... in signed order + UUID queryUuid = UUID.fromString("20000000-0000-0000-0000-000000000001"); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, greaterThan("uuid", queryUuid)) + .eval(LEGACY_UUID_FILE); + assertThat(shouldRead) + .as("Should read: legacy fallback should find UUIDs greater than query in signed order") + .isTrue(); + } + + @Test + public void testLegacyUuidFileIn() { + // Query: uuid IN (0x20..., 0x30...) + // Both are between 0x80... and 0x40... in signed order + // RFC evaluation: Both values are < 0x80... (lower bound) in unsigned, so filtered out + // Legacy evaluation: Uses signed comparator, so 0x80... < 0x20... < 0x30... < 0x40... + UUID uuid1 = UUID.fromString("20000000-0000-0000-0000-000000000001"); + UUID uuid2 = UUID.fromString("30000000-0000-0000-0000-000000000001"); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, in("uuid", uuid1, uuid2)).eval(LEGACY_UUID_FILE); + assertThat(shouldRead) + .as("Should read: legacy fallback should find UUIDs in signed-ordered bounds") + .isTrue(); + } + + @Test + public void testNonUuidPredicateNoLegacyFallback() { + // Verify that non-UUID predicates don't trigger legacy fallback (no performance impact) + // This is a sanity check - the evaluator should work normally for non-UUID columns + boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, equal("id", 50)).eval(FILE); + assertThat(shouldRead).as("Should read: id 50 is in range [30, 79]").isTrue(); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, equal("id", 100)).eval(FILE); + assertThat(shouldRead).as("Should not read: id 100 is above range [30, 79]").isFalse(); + } } diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java index f34cd730df77..02ef5662b261 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java @@ -39,6 +39,7 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; +import java.util.UUID; import org.apache.iceberg.DataFile; import org.apache.iceberg.Schema; import org.apache.iceberg.TestHelpers.Row; @@ -73,7 +74,8 @@ public class TestStrictMetricsEvaluator { Types.StructType.of( Types.NestedField.optional(16, "nested_col_no_stats", Types.IntegerType.get()), Types.NestedField.optional( - 17, "nested_col_with_stats", Types.IntegerType.get())))); + 17, "nested_col_with_stats", Types.IntegerType.get()))), + optional(18, "uuid", Types.UUIDType.get())); private static final int INT_MIN_VALUE = 30; private static final int INT_MAX_VALUE = 79; @@ -684,4 +686,154 @@ SCHEMA, lessThanOrEqual("struct.nested_col_with_stats", INT_MAX_VALUE)) new StrictMetricsEvaluator(SCHEMA, notNull("struct.nested_col_with_stats")).eval(FILE); assertThat(shouldRead).as("notNull nested column should not match").isFalse(); } + + // Tests for UUID with StrictMetricsEvaluator using RFC-compliant comparison only + + private static final UUID UUID_MIN = UUID.fromString("00000000-0000-0000-0000-000000000001"); + private static final UUID UUID_MAX = UUID.fromString("80000000-0000-0000-0000-000000000001"); + + private static final DataFile UUID_FILE = + new TestDataFile( + "uuid_file.avro", + Row.of(), + 50, + ImmutableMap.of(18, 50L), + ImmutableMap.of(18, 0L), + null, + ImmutableMap.of(18, toByteBuffer(Types.UUIDType.get(), UUID_MIN)), + ImmutableMap.of(18, toByteBuffer(Types.UUIDType.get(), UUID_MAX))); + + @Test + public void testStrictUuidGt() { + // Query: uuid > 0x00... (all UUIDs in file should be > this) + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + boolean allMatch = + new StrictMetricsEvaluator(SCHEMA, greaterThan("uuid", belowMin)).eval(UUID_FILE); + // With bounds [UUID_MIN, UUID_MAX], all values should be > belowMin + assertThat(allMatch).as("All UUIDs should be greater than belowMin").isTrue(); + } + + @Test + public void testStrictUuidLt() { + // Query: uuid < 0xFF... (all UUIDs in file should be < this) + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + boolean allMatch = + new StrictMetricsEvaluator(SCHEMA, lessThan("uuid", aboveMax)).eval(UUID_FILE); + // With bounds [UUID_MIN, UUID_MAX], all values should be < aboveMax + assertThat(allMatch).as("All UUIDs should be less than aboveMax").isTrue(); + } + + @Test + public void testStrictUuidEqNeverMatchesRange() { + // Strict eq should never match when there's a range of values + UUID middle = UUID.fromString("40000000-0000-0000-0000-000000000001"); + boolean allMatch = new StrictMetricsEvaluator(SCHEMA, equal("uuid", middle)).eval(UUID_FILE); + assertThat(allMatch).as("Strict eq should not match range").isFalse(); + } + + @Test + public void testStrictUuidInNeverMatchesRange() { + // Strict IN should never match when there's a range of values (lower != upper) + UUID middle1 = UUID.fromString("40000000-0000-0000-0000-000000000001"); + UUID middle2 = UUID.fromString("50000000-0000-0000-0000-000000000001"); + boolean allMatch = + new StrictMetricsEvaluator(SCHEMA, in("uuid", middle1, middle2)).eval(UUID_FILE); + assertThat(allMatch).as("Strict IN should not match range").isFalse(); + } + + // File with single UUID value (lower == upper) + private static final UUID SINGLE_UUID = UUID.fromString("40000000-0000-0000-0000-000000000001"); + private static final DataFile SINGLE_UUID_FILE = + new TestDataFile( + "single_uuid_file.avro", + Row.of(), + 50, + ImmutableMap.of(18, 50L), + ImmutableMap.of(18, 0L), + null, + ImmutableMap.of(18, toByteBuffer(Types.UUIDType.get(), SINGLE_UUID)), + ImmutableMap.of(18, toByteBuffer(Types.UUIDType.get(), SINGLE_UUID))); + + @Test + public void testStrictUuidInMatchesSingleValue() { + // Strict IN should match when lower == upper and the value is in the set + boolean allMatch = + new StrictMetricsEvaluator(SCHEMA, in("uuid", SINGLE_UUID)).eval(SINGLE_UUID_FILE); + assertThat(allMatch).as("Strict IN should match single value in set").isTrue(); + } + + @Test + public void testStrictUuidInDoesNotMatchWhenValueNotInSet() { + // Strict IN should not match when lower == upper but the value is not in the set + UUID otherUuid = UUID.fromString("50000000-0000-0000-0000-000000000001"); + boolean allMatch = + new StrictMetricsEvaluator(SCHEMA, in("uuid", otherUuid)).eval(SINGLE_UUID_FILE); + assertThat(allMatch).as("Strict IN should not match when value not in set").isFalse(); + } + + @Test + public void testStrictUuidNotInMatchesWhenAllValuesOutsideBounds() { + // Strict NOT IN should match when all values in the set are outside the file's bounds + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + boolean allMatch = + new StrictMetricsEvaluator(SCHEMA, notIn("uuid", belowMin, aboveMax)).eval(UUID_FILE); + // All values in the set are outside [UUID_MIN, UUID_MAX], so all rows match NOT IN + assertThat(allMatch).as("Strict NOT IN should match when all values outside bounds").isTrue(); + } + + @Test + public void testStrictUuidNotInDoesNotMatchWhenValueInBounds() { + // Strict NOT IN should not match when a value in the set is within bounds + UUID middle = UUID.fromString("40000000-0000-0000-0000-000000000001"); + boolean allMatch = new StrictMetricsEvaluator(SCHEMA, notIn("uuid", middle)).eval(UUID_FILE); + // middle is within [UUID_MIN, UUID_MAX], so some rows might match the value + assertThat(allMatch).as("Strict NOT IN should not match when value in bounds").isFalse(); + } + + // Tests for file with inverted UUID bounds (as would be written by legacy signed comparator) + // In RFC unsigned order: 0x40... < 0x80..., so lower > upper when interpreted with RFC + private static final UUID LEGACY_UUID_LOWER = + UUID.fromString("80000000-0000-0000-0000-000000000001"); + private static final UUID LEGACY_UUID_UPPER = + UUID.fromString("40000000-0000-0000-0000-000000000001"); + + private static final DataFile LEGACY_UUID_FILE = + new TestDataFile( + "legacy_uuid_file.avro", + Row.of(), + 50, + ImmutableMap.of(18, 50L), + ImmutableMap.of(18, 0L), + null, + ImmutableMap.of(18, toByteBuffer(Types.UUIDType.get(), LEGACY_UUID_LOWER)), + ImmutableMap.of(18, toByteBuffer(Types.UUIDType.get(), LEGACY_UUID_UPPER))); + + @Test + public void testStrictUuidInWithInvertedBounds() { + // With inverted bounds [0x80..., 0x40...] where lower > upper in RFC order, + // strict IN should never match since lower != upper + UUID uuid1 = UUID.fromString("20000000-0000-0000-0000-000000000001"); + UUID uuid2 = UUID.fromString("30000000-0000-0000-0000-000000000001"); + boolean allMatch = + new StrictMetricsEvaluator(SCHEMA, in("uuid", uuid1, uuid2)).eval(LEGACY_UUID_FILE); + assertThat(allMatch).as("Strict IN should not match when lower != upper").isFalse(); + } + + @Test + public void testStrictUuidNotInWithInvertedBounds() { + // Query: NOT IN (0x50..., 0x60...) + // With inverted bounds [0x80..., 0x40...], in RFC order: 0x40... < 0x50... < 0x60... < 0x80... + // The values 0x50... and 0x60... are between upper (0x40...) and lower (0x80...) in RFC order + // Since bounds are inverted, the evaluator sees lower (0x80...) > values, so they appear + // below the lower bound, and strict NOT IN returns true + UUID uuid1 = UUID.fromString("50000000-0000-0000-0000-000000000001"); + UUID uuid2 = UUID.fromString("60000000-0000-0000-0000-000000000001"); + boolean allMatch = + new StrictMetricsEvaluator(SCHEMA, notIn("uuid", uuid1, uuid2)).eval(LEGACY_UUID_FILE); + // With RFC-only comparison, values appear below lower bound (0x80...), so all rows match NOT IN + assertThat(allMatch) + .as("Strict NOT IN should match when all values below lower bound in RFC order") + .isTrue(); + } } diff --git a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java index 3cb46b309d82..284cd980f1a4 100644 --- a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java +++ b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java @@ -1138,6 +1138,150 @@ public void testUUID() { .isTrue(); } + // UUIDs for testing dual-comparator behavior with high-bit values + // These UUIDs span the signed/unsigned comparison boundary + private static final UUID UUID_LOW = UUID.fromString("00000000-0000-0000-0000-000000000001"); + private static final UUID UUID_MID = UUID.fromString("40000000-0000-0000-0000-000000000001"); + private static final UUID UUID_HIGH = UUID.fromString("80000000-0000-0000-0000-000000000001"); + private static final UUID UUID_HIGHER = UUID.fromString("c0000000-0000-0000-0000-000000000001"); + + private static final Schema UUID_SCHEMA = + new Schema( + required(1, "id", IntegerType.get()), optional(2, "uuid_col", Types.UUIDType.get())); + + /** + * Tests UUID filtering with values that span the signed/unsigned comparison boundary. In RFC 9562 + * unsigned comparison: UUID_LOW < UUID_MID < UUID_HIGH < UUID_HIGHER In legacy signed comparison: + * UUID_HIGH < UUID_HIGHER < UUID_LOW < UUID_MID (high bit treated as negative) + * + *

The dual-comparator approach ensures we don't incorrectly skip files that might contain + * matching rows when reading legacy files with inverted UUID bounds. + */ + @TestTemplate + public void testUUIDWithHighBitValues() throws IOException { + assumeThat(format).as("Only valid for Parquet").isEqualTo(FileFormat.PARQUET); + + // Create a file with UUIDs spanning the signed/unsigned boundary + List records = Lists.newArrayList(); + UUID[] uuids = {UUID_LOW, UUID_MID, UUID_HIGH, UUID_HIGHER}; + for (int i = 0; i < uuids.length; i++) { + GenericRecord record = GenericRecord.create(UUID_SCHEMA); + record.setField("id", i); + record.setField("uuid_col", uuids[i]); + records.add(record); + } + + File parquetFile = writeParquetFile("uuid-high-bit-test", UUID_SCHEMA, records); + InputFile inFile = Files.localInput(parquetFile); + try (ParquetFileReader reader = ParquetFileReader.open(parquetInputFile(inFile))) { + BlockMetaData blockMetaData = reader.getRowGroups().get(0); + MessageType fileSchema = reader.getFileMetaData().getSchema(); + + // Test equality - should find exact matches + assertThat(shouldReadUUID(equal("uuid_col", UUID_HIGH), fileSchema, blockMetaData)) + .as("Should read: file contains UUID_HIGH") + .isTrue(); + + // Test less than with high-bit UUID + // Query: uuid < UUID_HIGH (should match UUID_LOW and UUID_MID in RFC order) + assertThat(shouldReadUUID(lessThan("uuid_col", UUID_HIGH), fileSchema, blockMetaData)) + .as("Should read: file contains values less than UUID_HIGH") + .isTrue(); + + // Test greater than with low UUID + // Query: uuid > UUID_LOW (should match UUID_MID, UUID_HIGH, UUID_HIGHER in RFC order) + assertThat(shouldReadUUID(greaterThan("uuid_col", UUID_LOW), fileSchema, blockMetaData)) + .as("Should read: file contains values greater than UUID_LOW") + .isTrue(); + + // Test greater than with highest UUID - should not match any + UUID aboveAll = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + assertThat(shouldReadUUID(greaterThan("uuid_col", aboveAll), fileSchema, blockMetaData)) + .as("Should skip: no values greater than max UUID") + .isFalse(); + + // Test less than with lowest UUID - should not match any + UUID belowAll = UUID.fromString("00000000-0000-0000-0000-000000000000"); + assertThat(shouldReadUUID(lessThan("uuid_col", belowAll), fileSchema, blockMetaData)) + .as("Should skip: no values less than min UUID") + .isFalse(); + + // Test IN with high-bit UUIDs + assertThat(shouldReadUUID(in("uuid_col", UUID_HIGH, UUID_HIGHER), fileSchema, blockMetaData)) + .as("Should read: file contains one of the IN values") + .isTrue(); + + // Test IN with UUID outside file bounds (above max) + // The file's max is UUID_HIGHER (0xc0...), so 0xff... is outside bounds + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + assertThat(shouldReadUUID(in("uuid_col", aboveMax), fileSchema, blockMetaData)) + .as("Should skip: value is above file's max bound") + .isFalse(); + + // Test NOT IN - should read when file contains values not in the set + assertThat(shouldReadUUID(notIn("uuid_col", UUID_LOW), fileSchema, blockMetaData)) + .as("Should read: file contains values not in the exclusion set") + .isTrue(); + } + } + + /** + * Tests that the dual-comparator approach correctly handles queries that would give different + * results with signed vs unsigned comparison. This is critical for backward compatibility with + * legacy files that may have UUID bounds computed with signed comparison. + */ + @TestTemplate + public void testUUIDComparisonBoundary() throws IOException { + assumeThat(format).as("Only valid for Parquet").isEqualTo(FileFormat.PARQUET); + + // Create a file with only high-bit UUIDs (0x80... and above) + // These are the UUIDs that would be ordered differently in signed vs unsigned comparison + List records = Lists.newArrayList(); + UUID[] highBitUuids = {UUID_HIGH, UUID_HIGHER}; + for (int i = 0; i < highBitUuids.length; i++) { + GenericRecord record = GenericRecord.create(UUID_SCHEMA); + record.setField("id", i); + record.setField("uuid_col", highBitUuids[i]); + records.add(record); + } + + File parquetFile = writeParquetFile("uuid-high-only-test", UUID_SCHEMA, records); + InputFile inFile = Files.localInput(parquetFile); + try (ParquetFileReader reader = ParquetFileReader.open(parquetInputFile(inFile))) { + BlockMetaData blockMetaData = reader.getRowGroups().get(0); + MessageType fileSchema = reader.getFileMetaData().getSchema(); + + // Query: uuid > UUID_MID (0x40...) + // In RFC unsigned order: UUID_HIGH (0x80...) > UUID_MID, so should match + // In signed order: UUID_HIGH (0x80...) < UUID_MID (negative vs positive), so would not match + // The dual-comparator should ensure we read this file + assertThat(shouldReadUUID(greaterThan("uuid_col", UUID_MID), fileSchema, blockMetaData)) + .as("Should read: file contains UUIDs greater than UUID_MID in RFC order") + .isTrue(); + + // Query: uuid < UUID_MID (0x40...) + // In RFC unsigned order: all values (0x80..., 0xc0...) > UUID_MID, so should NOT match + // In signed order: all values would be < UUID_MID (negative < positive), so would match + // The dual-comparator returns true (should read) because signed comparison says there + // might be matches - this is the conservative behavior for backward compatibility + assertThat(shouldReadUUID(lessThan("uuid_col", UUID_MID), fileSchema, blockMetaData)) + .as("Should read: dual-comparator is conservative for legacy file compatibility") + .isTrue(); + + // Query: uuid > UUID_LOW (0x00...) + // Both comparators agree: all high-bit UUIDs are greater than UUID_LOW + assertThat(shouldReadUUID(greaterThan("uuid_col", UUID_LOW), fileSchema, blockMetaData)) + .as("Should read: all values are greater than UUID_LOW") + .isTrue(); + } + } + + private boolean shouldReadUUID( + Expression expression, MessageType messageType, BlockMetaData blockMetaData) { + return new ParquetMetricsRowGroupFilter(UUID_SCHEMA, expression, true) + .shouldRead(messageType, blockMetaData); + } + private boolean shouldRead(Expression expression) { return shouldRead(expression, true); } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java index cae9447513c0..70ab1f2372c9 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java @@ -30,6 +30,7 @@ import org.apache.iceberg.expressions.Bound; import org.apache.iceberg.expressions.BoundReference; import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.expressions.ExpressionUtil; import org.apache.iceberg.expressions.ExpressionVisitors; import org.apache.iceberg.expressions.ExpressionVisitors.BoundExpressionVisitor; import org.apache.iceberg.expressions.Expressions; @@ -51,6 +52,9 @@ public class ParquetMetricsRowGroupFilter { private final Schema schema; private final Expression expr; + // Expression using signed UUID comparator for backward compatibility with files written before + // RFC-compliant UUID comparisons were introduced. Null if no UUID predicates. + private final Expression signedUuidExpr; public ParquetMetricsRowGroupFilter(Schema schema, Expression unbound) { this(schema, unbound, true); @@ -59,7 +63,17 @@ public ParquetMetricsRowGroupFilter(Schema schema, Expression unbound) { public ParquetMetricsRowGroupFilter(Schema schema, Expression unbound, boolean caseSensitive) { this.schema = schema; StructType struct = schema.asStruct(); - this.expr = Binder.bind(struct, Expressions.rewriteNot(unbound), caseSensitive); + Expression rewritten = Expressions.rewriteNot(unbound); + this.expr = Binder.bind(struct, rewritten, caseSensitive); + + // Only create the signed UUID expression if there are UUID predicates that compare against + // bounds + if (ExpressionUtil.hasUUIDBoundsPredicate(this.expr)) { + Expression signedRewritten = ExpressionUtil.withSignedUUIDComparator(rewritten); + this.signedUuidExpr = Binder.bind(struct, signedRewritten, caseSensitive); + } else { + this.signedUuidExpr = null; + } } /** @@ -70,7 +84,19 @@ public ParquetMetricsRowGroupFilter(Schema schema, Expression unbound, boolean c * @return false if the file cannot contain rows that match the expression, true otherwise. */ public boolean shouldRead(MessageType fileSchema, BlockMetaData rowGroup) { - return new MetricsEvalVisitor().eval(fileSchema, rowGroup); + boolean result = new MetricsEvalVisitor().eval(fileSchema, rowGroup, expr, false); + + // If the RFC-compliant evaluation says rows might match, or there's no signed UUID expression, + // return the result. + if (result || signedUuidExpr == null) { + return result; + } + + // Try again with signed UUID comparator. There is no quick way to detect + // which comparator was used when the file's column metrics were written. + // The signedUuidExpr has literals with signed comparators for lt/gt/eq predicates. + // For IN predicates, signed comparator is used via MetricsEvalVisitor#comparatorForIn. + return new MetricsEvalVisitor().eval(fileSchema, rowGroup, signedUuidExpr, true); } private static final boolean ROWS_MIGHT_MATCH = true; @@ -80,8 +106,16 @@ private class MetricsEvalVisitor extends BoundExpressionVisitor { private Map> stats = null; private Map valueCounts = null; private Map> conversions = null; - - private boolean eval(MessageType fileSchema, BlockMetaData rowGroup) { + // Flag to use signed UUID comparator for backward compatibility. + // This is needed for the IN predicate because the comparator information is lost + // when binding converts literals to a Set of raw values. + private boolean useSignedUuidComparator = false; + + private boolean eval( + MessageType fileSchema, + BlockMetaData rowGroup, + Expression expression, + boolean signedUuidMode) { if (rowGroup.getRowCount() <= 0) { return ROWS_CANNOT_MATCH; } @@ -89,6 +123,7 @@ private boolean eval(MessageType fileSchema, BlockMetaData rowGroup) { this.stats = Maps.newHashMap(); this.valueCounts = Maps.newHashMap(); this.conversions = Maps.newHashMap(); + this.useSignedUuidComparator = signedUuidMode; for (ColumnChunkMetaData col : rowGroup.getColumns()) { PrimitiveType colType = fileSchema.getType(col.getPath().toArray()).asPrimitiveType(); if (colType.getId() != null) { @@ -100,7 +135,21 @@ private boolean eval(MessageType fileSchema, BlockMetaData rowGroup) { } } - return ExpressionVisitors.visitEvaluator(expr, this); + return ExpressionVisitors.visitEvaluator(expression, this); + } + + /** + * Returns the appropriate comparator for the given reference. This is needed for the IN + * predicate because the comparator information is lost when binding converts literals to a Set + * of raw values. For other predicates, the literal carries the comparator. + */ + @SuppressWarnings("unchecked") + private Comparator comparatorForIn(BoundReference ref) { + if (useSignedUuidComparator + && ref.type().typeId() == org.apache.iceberg.types.Type.TypeID.UUID) { + return (Comparator) Comparators.signedUUIDs(); + } + return ref.comparator(); } @Override @@ -406,20 +455,17 @@ public Boolean in(BoundReference ref, Set literalSet) { return ROWS_MIGHT_MATCH; } + Comparator cmp = comparatorForIn(ref); T lower = min(colStats, id); literals = - literals.stream() - .filter(v -> ref.comparator().compare(lower, v) <= 0) - .collect(Collectors.toList()); + literals.stream().filter(v -> cmp.compare(lower, v) <= 0).collect(Collectors.toList()); if (literals.isEmpty()) { // if all values are less than lower bound, rows cannot match. return ROWS_CANNOT_MATCH; } T upper = max(colStats, id); literals = - literals.stream() - .filter(v -> ref.comparator().compare(upper, v) >= 0) - .collect(Collectors.toList()); + literals.stream().filter(v -> cmp.compare(upper, v) >= 0).collect(Collectors.toList()); if (literals .isEmpty()) { // if all remaining values are greater than upper bound, rows cannot // match. From edd63787530902d7edac432f967a329f745def2d Mon Sep 17 00:00:00 2001 From: bodduv Date: Wed, 21 Jan 2026 22:07:58 +0100 Subject: [PATCH 03/11] Do only a single tree traversal; address comments --- .../iceberg/expressions/ExpressionUtil.java | 132 ++++-------------- .../InclusiveMetricsEvaluator.java | 22 ++- .../apache/iceberg/expressions/Literals.java | 1 + .../expressions/ManifestEvaluator.java | 17 +-- .../expressions/TestExpressionUtil.java | 114 +++++++-------- .../parquet/ParquetMetricsRowGroupFilter.java | 14 +- 6 files changed, 99 insertions(+), 201 deletions(-) diff --git a/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java b/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java index 1f0ab15f4fcd..8281d03ed9dd 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java +++ b/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java @@ -24,6 +24,7 @@ import java.time.temporal.ChronoUnit; import java.util.List; import java.util.Locale; +import java.util.Optional; import java.util.UUID; import java.util.concurrent.TimeUnit; import java.util.function.Function; @@ -747,118 +748,36 @@ private static PartitionSpec identitySpec(Schema schema, int... ids) { } /** - * Checks if the given bound expression contains any UUID predicates that compare against min/max - * bounds. These predicates may produce incorrect results when evaluating against statistics - * written with a different UUID comparator. + * Transform UUID literals in an unbound expression to use signed comparators, if the expression + * contains UUID bounds predicates. This maintains backward compatibility with files written + * before RFC 4122/9562 compliant comparison was implemented. * - * @param expr a bound expression - * @return true if the expression contains UUID predicates that compare against bounds - */ - public static boolean hasUUIDBoundsPredicate(Expression expr) { - return ExpressionVisitors.visit(expr, new UUIDBoundsPredicateDetector()); - } - - /** - * Transforms an unbound expression to use the signed UUID comparator in all UUID literals. This - * is used for backward compatibility with files written before RFC 4122/9562 compliant comparison - * was implemented. + *

The transformed expression contains literals with signed comparators for lt/gt/eq + * predicates. For IN predicates, the comparator information is lost when binding converts + * literals to a Set of raw values, so the evaluator must handle this separately. * * @param expr an unbound expression - * @return a new expression with UUID literals using the signed comparator + * @return Optional containing the transformed expression if UUID bounds predicates exist, or + * empty if no UUID bounds predicates were found */ - public static Expression withSignedUUIDComparator(Expression expr) { - return ExpressionVisitors.visit(expr, new SignedUUIDLiteralTransformer()); + public static Optional toSignedUUIDLiteral(Expression expr) { + SignedUUIDLiteralVisitor visitor = new SignedUUIDLiteralVisitor(); + Expression transformed = ExpressionVisitors.visit(expr, visitor); + return visitor.foundUUIDBoundsPredicate() ? Optional.of(transformed) : Optional.empty(); } /** - * Visitor that detects if an expression contains UUID predicates that compare against bounds. - * These include: lt, ltEq, gt, gtEq, eq, notEq, in, notIn on UUID columns. + * Visitor that transforms an expression to use the signed UUID comparator in all UUID literals, + * while also tracking whether any UUID bounds predicates were found. */ - private static class UUIDBoundsPredicateDetector - extends ExpressionVisitors.ExpressionVisitor { - - @Override - public Boolean alwaysTrue() { - return false; - } - - @Override - public Boolean alwaysFalse() { - return false; - } - - @Override - public Boolean not(Boolean result) { - return result; - } - - @Override - public Boolean and(Boolean leftResult, Boolean rightResult) { - return leftResult || rightResult; - } - - @Override - public Boolean or(Boolean leftResult, Boolean rightResult) { - return leftResult || rightResult; - } + private static class SignedUUIDLiteralVisitor + extends ExpressionVisitors.ExpressionVisitor { - @Override - public Boolean predicate(BoundPredicate pred) { - if (pred.term() instanceof BoundReference) { - BoundReference ref = (BoundReference) pred.term(); - if (ref.type().typeId() == Type.TypeID.UUID) { - switch (pred.op()) { - case LT: - case LT_EQ: - case GT: - case GT_EQ: - case EQ: - case NOT_EQ: - case IN: - case NOT_IN: - return true; - default: - return false; - } - } - } - return false; - } + private boolean foundUUIDBoundsPredicate = false; - @Override - public Boolean predicate(UnboundPredicate pred) { - // For unbound predicates, we can check if the literal is a UUID - if (pred.literal() != null && pred.literal().value() instanceof UUID) { - switch (pred.op()) { - case LT: - case LT_EQ: - case GT: - case GT_EQ: - case EQ: - case NOT_EQ: - case IN: - case NOT_IN: - return true; - default: - return false; - } - } - // Check for IN/NOT_IN with UUID literals - if (pred.literals() != null && !pred.literals().isEmpty()) { - Literal first = pred.literals().get(0); - if (first.value() instanceof UUID) { - return true; - } - } - return false; + boolean foundUUIDBoundsPredicate() { + return foundUUIDBoundsPredicate; } - } - - /** - * Visitor that transforms an expression to use the signed UUID comparator in all UUID literals. - */ - private static class SignedUUIDLiteralTransformer - extends ExpressionVisitors.ExpressionVisitor { @Override public Expression alwaysTrue() { @@ -911,19 +830,25 @@ public Expression predicate(UnboundPredicate pred) { case GT_EQ: case EQ: case NOT_EQ: - case STARTS_WITH: - case NOT_STARTS_WITH: Literal lit = pred.literal(); if (lit.value() instanceof UUID) { + foundUUIDBoundsPredicate = true; Literals.UUIDLiteral uuidLit = (Literals.UUIDLiteral) lit; return new UnboundPredicate<>(pred.op(), term, (T) uuidLit.withSignedComparator()); } + + return pred; + + case STARTS_WITH: + case NOT_STARTS_WITH: + // These operations don't apply to UUIDs, no transformation needed return pred; case IN: case NOT_IN: List> literals = pred.literals(); if (!literals.isEmpty() && literals.get(0).value() instanceof UUID) { + foundUUIDBoundsPredicate = true; List transformedValues = literals.stream() .map( @@ -934,6 +859,7 @@ public Expression predicate(UnboundPredicate pred) { .collect(Collectors.toList()); return new UnboundPredicate<>(pred.op(), term, transformedValues); } + return pred; default: diff --git a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java index 6b6cc0b96560..494713bf87bf 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java @@ -57,8 +57,9 @@ public class InclusiveMetricsEvaluator { private static final int IN_PREDICATE_LIMIT = 200; private final Expression expr; - // Expression using signed UUID comparator for backward compatibility with files written before - // RFC-compliant UUID comparisons were introduced. Null if no UUID predicates. + // Expression using signed UUID comparator for backward compatibility with files written + // prior to the introduction of RFC-compliant UUID comparisons. + // Null if there are no UUID predicates comparing against bounds in the expression. private final Expression signedUuidExpr; public InclusiveMetricsEvaluator(Schema schema, Expression unbound) { @@ -70,14 +71,11 @@ public InclusiveMetricsEvaluator(Schema schema, Expression unbound, boolean case Expression rewritten = rewriteNot(unbound); this.expr = Binder.bind(struct, rewritten, caseSensitive); - // Only create the signed UUID expression if there are UUID predicates that compare against - // bounds - if (ExpressionUtil.hasUUIDBoundsPredicate(this.expr)) { - Expression signedRewritten = ExpressionUtil.withSignedUUIDComparator(rewritten); - this.signedUuidExpr = Binder.bind(struct, signedRewritten, caseSensitive); - } else { - this.signedUuidExpr = null; - } + // Create the signed UUID expression iff there are UUID predicates that compare against bounds. + this.signedUuidExpr = + ExpressionUtil.toSignedUUIDLiteral(rewritten) + .map(transformed -> Binder.bind(struct, transformed, caseSensitive)) + .orElse(null); } /** @@ -98,9 +96,6 @@ public boolean eval(ContentFile file) { // Always try with signed UUID comparator as a fallback. There is no reliable way to detect // which comparator was used when the file's column metrics were written. - // The signedUuidExpr has literals with signed comparators for lt/gt/eq predicates. - // For IN predicates, we pass signedUuidMode=true so comparatorForIn() returns signed - // comparator. return new MetricsEvalVisitor().eval(file, signedUuidExpr, true); } @@ -150,6 +145,7 @@ private Comparator comparatorForIn(Bound term) { if (useSignedUuidComparator && term.ref().type().typeId() == Type.TypeID.UUID) { return (Comparator) Comparators.signedUUIDs(); } + return ((BoundTerm) term).comparator(); } diff --git a/api/src/main/java/org/apache/iceberg/expressions/Literals.java b/api/src/main/java/org/apache/iceberg/expressions/Literals.java index 40f664bfc0f4..82a69019afc1 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/Literals.java +++ b/api/src/main/java/org/apache/iceberg/expressions/Literals.java @@ -642,6 +642,7 @@ public Comparator comparator() { if (cmp == null) { cmp = useSignedComparator ? SIGNED_CMP : RFC_CMP; } + return cmp; } diff --git a/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java index 867d765a71fa..7d8da8a477d8 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java @@ -72,14 +72,11 @@ private ManifestEvaluator(PartitionSpec spec, Expression partitionFilter, boolea Expression rewritten = rewriteNot(partitionFilter); this.expr = Binder.bind(partitionType, rewritten, caseSensitive); - // Only create the signed UUID expression if there are UUID predicates that compare against - // bounds - if (ExpressionUtil.hasUUIDBoundsPredicate(this.expr)) { - Expression signedRewritten = ExpressionUtil.withSignedUUIDComparator(rewritten); - this.signedUuidExpr = Binder.bind(partitionType, signedRewritten, caseSensitive); - } else { - this.signedUuidExpr = null; - } + // Create the signed UUID expression iff there are UUID predicates that compare against bounds. + this.signedUuidExpr = + ExpressionUtil.toSignedUUIDLiteral(rewritten) + .map(transformed -> Binder.bind(partitionType, transformed, caseSensitive)) + .orElse(null); } /** @@ -99,9 +96,6 @@ public boolean eval(ManifestFile manifest) { // Always try with signed UUID comparator as a fallback. There is no reliable way to detect // which comparator was used when the manifest's partition field summaries were written. - // The signedUuidExpr has literals with signed comparators for lt/gt/eq predicates. - // For IN predicates, we pass signedUuidMode=true so comparatorForIn() returns signed - // comparator. return new ManifestEvalVisitor().eval(manifest, signedUuidExpr, true); } @@ -135,6 +129,7 @@ private Comparator comparatorForIn(BoundReference ref) { if (useSignedUuidComparator && ref.type().typeId() == Type.TypeID.UUID) { return (Comparator) Comparators.signedUUIDs(); } + return ref.comparator(); } diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java index b3a195041d8e..163e4a6642c2 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java @@ -29,6 +29,7 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.UUID; import java.util.regex.Pattern; import java.util.stream.IntStream; @@ -1354,119 +1355,100 @@ private void assertEquals(UnboundTransform expected, UnboundTransform pred = (UnboundPredicate) transformed; + Optional result = ExpressionUtil.toSignedUUIDLiteral(original); + assertThat(result).isPresent(); + assertThat(result.get()).isInstanceOf(UnboundPredicate.class); + UnboundPredicate pred = (UnboundPredicate) result.get(); assertThat(pred.literal().value()).isEqualTo(testUuid); // The literal should now use the signed comparator assertThat(pred.literal()).isInstanceOf(Literals.UUIDLiteral.class); } @Test - public void testWithSignedUUIDComparatorTransformsInPredicate() { + public void testToSignedUUIDLiteralTransformsInPredicate() { UUID uuid1 = UUID.randomUUID(); UUID uuid2 = UUID.randomUUID(); Expression original = Expressions.in("uuid_col", uuid1, uuid2); - Expression transformed = ExpressionUtil.withSignedUUIDComparator(original); - assertThat(transformed).isInstanceOf(UnboundPredicate.class); - UnboundPredicate pred = (UnboundPredicate) transformed; + Optional result = ExpressionUtil.toSignedUUIDLiteral(original); + assertThat(result).isPresent(); + assertThat(result.get()).isInstanceOf(UnboundPredicate.class); + UnboundPredicate pred = (UnboundPredicate) result.get(); assertThat(pred.literals()).hasSize(2); } @Test - public void testWithSignedUUIDComparatorPreservesNonUuidLiterals() { - Expression original = Expressions.equal("id", 42L); - Expression transformed = ExpressionUtil.withSignedUUIDComparator(original); - - // Should be unchanged - assertThat(transformed).isInstanceOf(UnboundPredicate.class); - UnboundPredicate pred = (UnboundPredicate) transformed; - assertThat(pred.literal().value()).isEqualTo(42L); - } - - @Test - public void testWithSignedUUIDComparatorTransformsCompoundExpression() { + public void testToSignedUUIDLiteralTransformsCompoundExpression() { UUID testUuid = UUID.randomUUID(); Expression original = Expressions.and( Expressions.equal("id", 42L), Expressions.greaterThan("uuid_col", testUuid)); - Expression transformed = ExpressionUtil.withSignedUUIDComparator(original); - assertThat(transformed).isInstanceOf(And.class); - And and = (And) transformed; + Optional result = ExpressionUtil.toSignedUUIDLiteral(original); + assertThat(result).isPresent(); + assertThat(result.get()).isInstanceOf(And.class); + And and = (And) result.get(); // Both children should be transformed assertThat(and.left()).isInstanceOf(UnboundPredicate.class); assertThat(and.right()).isInstanceOf(UnboundPredicate.class); diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java index 70ab1f2372c9..b5faa9c1d58f 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java @@ -66,14 +66,11 @@ public ParquetMetricsRowGroupFilter(Schema schema, Expression unbound, boolean c Expression rewritten = Expressions.rewriteNot(unbound); this.expr = Binder.bind(struct, rewritten, caseSensitive); - // Only create the signed UUID expression if there are UUID predicates that compare against - // bounds - if (ExpressionUtil.hasUUIDBoundsPredicate(this.expr)) { - Expression signedRewritten = ExpressionUtil.withSignedUUIDComparator(rewritten); - this.signedUuidExpr = Binder.bind(struct, signedRewritten, caseSensitive); - } else { - this.signedUuidExpr = null; - } + // Create the signed UUID expression iff there are UUID predicates that compare against bounds. + this.signedUuidExpr = + ExpressionUtil.toSignedUUIDLiteral(rewritten) + .map(transformed -> Binder.bind(struct, transformed, caseSensitive)) + .orElse(null); } /** @@ -149,6 +146,7 @@ private Comparator comparatorForIn(BoundReference ref) { && ref.type().typeId() == org.apache.iceberg.types.Type.TypeID.UUID) { return (Comparator) Comparators.signedUUIDs(); } + return ref.comparator(); } From deb5bf8cda223b6711ac7a4fb1d50a26e3cabdb1 Mon Sep 17 00:00:00 2001 From: bodduv Date: Thu, 22 Jan 2026 15:25:18 +0100 Subject: [PATCH 04/11] Address comments; drop expression as a parameter of eval(); move to null checks --- .../iceberg/expressions/ExpressionUtil.java | 7 ++-- .../InclusiveMetricsEvaluator.java | 13 +++---- .../expressions/ManifestEvaluator.java | 13 +++---- .../expressions/TestExpressionUtil.java | 37 +++++++++---------- .../parquet/ParquetMetricsRowGroupFilter.java | 17 +++------ 5 files changed, 40 insertions(+), 47 deletions(-) diff --git a/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java b/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java index 8281d03ed9dd..ccd8182c8c27 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java +++ b/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java @@ -24,7 +24,6 @@ import java.time.temporal.ChronoUnit; import java.util.List; import java.util.Locale; -import java.util.Optional; import java.util.UUID; import java.util.concurrent.TimeUnit; import java.util.function.Function; @@ -32,6 +31,7 @@ import java.util.stream.Collectors; import java.util.stream.IntStream; import java.util.stream.StreamSupport; +import javax.annotation.Nullable; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; import org.apache.iceberg.Table; @@ -760,10 +760,11 @@ private static PartitionSpec identitySpec(Schema schema, int... ids) { * @return Optional containing the transformed expression if UUID bounds predicates exist, or * empty if no UUID bounds predicates were found */ - public static Optional toSignedUUIDLiteral(Expression expr) { + @Nullable + public static Expression toSignedUUIDLiteral(Expression expr) { SignedUUIDLiteralVisitor visitor = new SignedUUIDLiteralVisitor(); Expression transformed = ExpressionVisitors.visit(expr, visitor); - return visitor.foundUUIDBoundsPredicate() ? Optional.of(transformed) : Optional.empty(); + return visitor.foundUUIDBoundsPredicate() ? transformed : null; } /** diff --git a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java index 494713bf87bf..6354efc586f5 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java @@ -72,10 +72,9 @@ public InclusiveMetricsEvaluator(Schema schema, Expression unbound, boolean case this.expr = Binder.bind(struct, rewritten, caseSensitive); // Create the signed UUID expression iff there are UUID predicates that compare against bounds. + Expression transformed = ExpressionUtil.toSignedUUIDLiteral(rewritten); this.signedUuidExpr = - ExpressionUtil.toSignedUUIDLiteral(rewritten) - .map(transformed -> Binder.bind(struct, transformed, caseSensitive)) - .orElse(null); + transformed != null ? Binder.bind(struct, transformed, caseSensitive) : null; } /** @@ -86,7 +85,7 @@ public InclusiveMetricsEvaluator(Schema schema, Expression unbound, boolean case */ public boolean eval(ContentFile file) { // TODO: detect the case where a column is missing from the file using file's max field id. - boolean result = new MetricsEvalVisitor().eval(file, expr, false); + boolean result = new MetricsEvalVisitor().eval(file, false); // If the RFC-compliant evaluation says rows might match, or there's no signed UUID expression, // return the result. @@ -96,7 +95,7 @@ public boolean eval(ContentFile file) { // Always try with signed UUID comparator as a fallback. There is no reliable way to detect // which comparator was used when the file's column metrics were written. - return new MetricsEvalVisitor().eval(file, signedUuidExpr, true); + return new MetricsEvalVisitor().eval(file, true); } private static final boolean ROWS_MIGHT_MATCH = true; @@ -113,7 +112,7 @@ private class MetricsEvalVisitor extends ExpressionVisitors.BoundVisitor of raw values. private boolean useSignedUuidComparator = false; - private boolean eval(ContentFile file, Expression expression, boolean signedUuidMode) { + private boolean eval(ContentFile file, boolean signedUuidMode) { if (file.recordCount() == 0) { return ROWS_CANNOT_MATCH; } @@ -132,7 +131,7 @@ private boolean eval(ContentFile file, Expression expression, boolean signedU this.upperBounds = file.upperBounds(); this.useSignedUuidComparator = signedUuidMode; - return ExpressionVisitors.visitEvaluator(expression, this); + return ExpressionVisitors.visitEvaluator(signedUuidMode ? signedUuidExpr : expr, this); } /** diff --git a/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java index 7d8da8a477d8..4298bc247191 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java @@ -73,10 +73,9 @@ private ManifestEvaluator(PartitionSpec spec, Expression partitionFilter, boolea this.expr = Binder.bind(partitionType, rewritten, caseSensitive); // Create the signed UUID expression iff there are UUID predicates that compare against bounds. + Expression transformed = ExpressionUtil.toSignedUUIDLiteral(rewritten); this.signedUuidExpr = - ExpressionUtil.toSignedUUIDLiteral(rewritten) - .map(transformed -> Binder.bind(partitionType, transformed, caseSensitive)) - .orElse(null); + transformed != null ? Binder.bind(partitionType, transformed, caseSensitive) : null; } /** @@ -86,7 +85,7 @@ private ManifestEvaluator(PartitionSpec spec, Expression partitionFilter, boolea * @return false if the file cannot contain rows that match the expression, true otherwise. */ public boolean eval(ManifestFile manifest) { - boolean result = new ManifestEvalVisitor().eval(manifest, expr, false); + boolean result = new ManifestEvalVisitor().eval(manifest, false); // If the RFC-compliant evaluation says rows might match, or there's no signed UUID expression, // return the result. @@ -96,7 +95,7 @@ public boolean eval(ManifestFile manifest) { // Always try with signed UUID comparator as a fallback. There is no reliable way to detect // which comparator was used when the manifest's partition field summaries were written. - return new ManifestEvalVisitor().eval(manifest, signedUuidExpr, true); + return new ManifestEvalVisitor().eval(manifest, true); } private static final boolean ROWS_MIGHT_MATCH = true; @@ -109,14 +108,14 @@ private class ManifestEvalVisitor extends BoundExpressionVisitor { // when binding converts literals to a Set of raw values. private boolean useSignedUuidComparator = false; - private boolean eval(ManifestFile manifest, Expression expression, boolean signedUuidMode) { + private boolean eval(ManifestFile manifest, boolean signedUuidMode) { this.stats = manifest.partitions(); this.useSignedUuidComparator = signedUuidMode; if (stats == null) { return ROWS_MIGHT_MATCH; } - return ExpressionVisitors.visitEvaluator(expression, this); + return ExpressionVisitors.visitEvaluator(signedUuidMode ? signedUuidExpr : expr, this); } /** diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java index 163e4a6642c2..5c21885dd0d2 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java @@ -29,7 +29,6 @@ import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.Optional; import java.util.UUID; import java.util.regex.Pattern; import java.util.stream.IntStream; @@ -1361,7 +1360,7 @@ public void testToSignedUUIDLiteralDetectsUuidLt() { assertThat(ExpressionUtil.toSignedUUIDLiteral(original)) .as("Should detect UUID lt predicate") - .isPresent(); + .isNotNull(); } @Test @@ -1370,7 +1369,7 @@ public void testToSignedUUIDLiteralDetectsUuidEq() { assertThat(ExpressionUtil.toSignedUUIDLiteral(original)) .as("Should detect UUID eq predicate") - .isPresent(); + .isNotNull(); } @Test @@ -1379,7 +1378,7 @@ public void testToSignedUUIDLiteralDetectsUuidIn() { assertThat(ExpressionUtil.toSignedUUIDLiteral(original)) .as("Should detect UUID in predicate") - .isPresent(); + .isNotNull(); } @Test @@ -1388,7 +1387,7 @@ public void testToSignedUUIDLiteralNoDetectionForNonUuid() { assertThat(ExpressionUtil.toSignedUUIDLiteral(original)) .as("Should not detect non-UUID predicate") - .isEmpty(); + .isNull(); } @Test @@ -1397,7 +1396,7 @@ public void testToSignedUUIDLiteralNoDetectionForIsNull() { assertThat(ExpressionUtil.toSignedUUIDLiteral(original)) .as("Should not detect UUID isNull predicate (doesn't compare against bounds)") - .isEmpty(); + .isNull(); } @Test @@ -1408,7 +1407,7 @@ public void testToSignedUUIDLiteralDetectsInCompoundExpression() { assertThat(ExpressionUtil.toSignedUUIDLiteral(original)) .as("Should detect UUID predicate in compound expression") - .isPresent(); + .isNotNull(); } @Test @@ -1416,10 +1415,10 @@ public void testToSignedUUIDLiteralTransformsLiteral() { UUID testUuid = UUID.randomUUID(); Expression original = Expressions.equal("uuid_col", testUuid); - Optional result = ExpressionUtil.toSignedUUIDLiteral(original); - assertThat(result).isPresent(); - assertThat(result.get()).isInstanceOf(UnboundPredicate.class); - UnboundPredicate pred = (UnboundPredicate) result.get(); + Expression result = ExpressionUtil.toSignedUUIDLiteral(original); + assertThat(result).isNotNull(); + assertThat(result).isInstanceOf(UnboundPredicate.class); + UnboundPredicate pred = (UnboundPredicate) result; assertThat(pred.literal().value()).isEqualTo(testUuid); // The literal should now use the signed comparator assertThat(pred.literal()).isInstanceOf(Literals.UUIDLiteral.class); @@ -1431,10 +1430,10 @@ public void testToSignedUUIDLiteralTransformsInPredicate() { UUID uuid2 = UUID.randomUUID(); Expression original = Expressions.in("uuid_col", uuid1, uuid2); - Optional result = ExpressionUtil.toSignedUUIDLiteral(original); - assertThat(result).isPresent(); - assertThat(result.get()).isInstanceOf(UnboundPredicate.class); - UnboundPredicate pred = (UnboundPredicate) result.get(); + Expression result = ExpressionUtil.toSignedUUIDLiteral(original); + assertThat(result).isNotNull(); + assertThat(result).isInstanceOf(UnboundPredicate.class); + UnboundPredicate pred = (UnboundPredicate) result; assertThat(pred.literals()).hasSize(2); } @@ -1445,10 +1444,10 @@ public void testToSignedUUIDLiteralTransformsCompoundExpression() { Expressions.and( Expressions.equal("id", 42L), Expressions.greaterThan("uuid_col", testUuid)); - Optional result = ExpressionUtil.toSignedUUIDLiteral(original); - assertThat(result).isPresent(); - assertThat(result.get()).isInstanceOf(And.class); - And and = (And) result.get(); + Expression result = ExpressionUtil.toSignedUUIDLiteral(original); + assertThat(result).isNotNull(); + assertThat(result).isInstanceOf(And.class); + And and = (And) result; // Both children should be transformed assertThat(and.left()).isInstanceOf(UnboundPredicate.class); assertThat(and.right()).isInstanceOf(UnboundPredicate.class); diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java index b5faa9c1d58f..4c9b960c1c0a 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java @@ -67,10 +67,9 @@ public ParquetMetricsRowGroupFilter(Schema schema, Expression unbound, boolean c this.expr = Binder.bind(struct, rewritten, caseSensitive); // Create the signed UUID expression iff there are UUID predicates that compare against bounds. + Expression transformed = ExpressionUtil.toSignedUUIDLiteral(rewritten); this.signedUuidExpr = - ExpressionUtil.toSignedUUIDLiteral(rewritten) - .map(transformed -> Binder.bind(struct, transformed, caseSensitive)) - .orElse(null); + transformed != null ? Binder.bind(struct, transformed, caseSensitive) : null; } /** @@ -81,7 +80,7 @@ public ParquetMetricsRowGroupFilter(Schema schema, Expression unbound, boolean c * @return false if the file cannot contain rows that match the expression, true otherwise. */ public boolean shouldRead(MessageType fileSchema, BlockMetaData rowGroup) { - boolean result = new MetricsEvalVisitor().eval(fileSchema, rowGroup, expr, false); + boolean result = new MetricsEvalVisitor().eval(fileSchema, rowGroup, false); // If the RFC-compliant evaluation says rows might match, or there's no signed UUID expression, // return the result. @@ -93,7 +92,7 @@ public boolean shouldRead(MessageType fileSchema, BlockMetaData rowGroup) { // which comparator was used when the file's column metrics were written. // The signedUuidExpr has literals with signed comparators for lt/gt/eq predicates. // For IN predicates, signed comparator is used via MetricsEvalVisitor#comparatorForIn. - return new MetricsEvalVisitor().eval(fileSchema, rowGroup, signedUuidExpr, true); + return new MetricsEvalVisitor().eval(fileSchema, rowGroup, true); } private static final boolean ROWS_MIGHT_MATCH = true; @@ -108,11 +107,7 @@ private class MetricsEvalVisitor extends BoundExpressionVisitor { // when binding converts literals to a Set of raw values. private boolean useSignedUuidComparator = false; - private boolean eval( - MessageType fileSchema, - BlockMetaData rowGroup, - Expression expression, - boolean signedUuidMode) { + private boolean eval(MessageType fileSchema, BlockMetaData rowGroup, boolean signedUuidMode) { if (rowGroup.getRowCount() <= 0) { return ROWS_CANNOT_MATCH; } @@ -132,7 +127,7 @@ private boolean eval( } } - return ExpressionVisitors.visitEvaluator(expression, this); + return ExpressionVisitors.visitEvaluator(signedUuidMode ? signedUuidExpr : expr, this); } /** From 9af9c5ade107054f99e4186ea3996f7bbd52c6e4 Mon Sep 17 00:00:00 2001 From: bodduv Date: Thu, 22 Jan 2026 17:21:48 +0100 Subject: [PATCH 05/11] Consolidate comparatorForIn into a static method --- .../InclusiveMetricsEvaluator.java | 20 ++++-------------- .../expressions/ManifestEvaluator.java | 18 +++------------- .../org/apache/iceberg/types/Comparators.java | 18 ++++++++++++++++ .../parquet/ParquetMetricsRowGroupFilter.java | 21 ++++--------------- 4 files changed, 29 insertions(+), 48 deletions(-) diff --git a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java index 6354efc586f5..b0977e90d6b3 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java @@ -32,7 +32,6 @@ import org.apache.iceberg.transforms.Transform; import org.apache.iceberg.types.Comparators; import org.apache.iceberg.types.Conversions; -import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types.StructType; import org.apache.iceberg.util.NaNUtil; import org.apache.iceberg.variants.Variant; @@ -134,20 +133,6 @@ private boolean eval(ContentFile file, boolean signedUuidMode) { return ExpressionVisitors.visitEvaluator(signedUuidMode ? signedUuidExpr : expr, this); } - /** - * Returns the appropriate comparator for the given term. This is needed for the IN predicate - * because the comparator information is lost when binding converts literals to a Set of raw - * values. For other predicates, the literal carries the comparator. - */ - @SuppressWarnings("unchecked") - private Comparator comparatorForIn(Bound term) { - if (useSignedUuidComparator && term.ref().type().typeId() == Type.TypeID.UUID) { - return (Comparator) Comparators.signedUUIDs(); - } - - return ((BoundTerm) term).comparator(); - } - @Override public Boolean alwaysTrue() { return ROWS_MIGHT_MATCH; // all rows match @@ -399,7 +384,10 @@ public Boolean in(Bound term, Set literalSet) { return ROWS_MIGHT_MATCH; } - Comparator cmp = comparatorForIn(term); + // use an appropriate comparator, for UUIDs use the signed comparator if requested + Comparator cmp = + Comparators.comparatorFor( + term.ref().type(), ((BoundTerm) term).comparator(), useSignedUuidComparator); literals = literals.stream().filter(v -> cmp.compare(lower, v) <= 0).collect(Collectors.toList()); // if all values are less than lower bound, rows cannot match diff --git a/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java index 4298bc247191..f7c797ddcc33 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java @@ -118,20 +118,6 @@ private boolean eval(ManifestFile manifest, boolean signedUuidMode) { return ExpressionVisitors.visitEvaluator(signedUuidMode ? signedUuidExpr : expr, this); } - /** - * Returns the appropriate comparator for the given reference. This is needed for the IN - * predicate because the comparator information is lost when binding converts literals to a Set - * of raw values. For other predicates, the literal carries the comparator. - */ - @SuppressWarnings("unchecked") - private Comparator comparatorForIn(BoundReference ref) { - if (useSignedUuidComparator && ref.type().typeId() == Type.TypeID.UUID) { - return (Comparator) Comparators.signedUUIDs(); - } - - return ref.comparator(); - } - @Override public Boolean alwaysTrue() { return ROWS_MIGHT_MATCH; // all rows match @@ -335,7 +321,9 @@ public Boolean in(BoundReference ref, Set literalSet) { return ROWS_MIGHT_MATCH; } - Comparator cmp = comparatorForIn(ref); + // use an appropriate comparator, for UUIDs use the signed comparator if requested + Comparator cmp = + Comparators.comparatorFor(ref.type(), ref.comparator(), useSignedUuidComparator); T lower = Conversions.fromByteBuffer(ref.type(), fieldStats.lowerBound()); literals = literals.stream().filter(v -> cmp.compare(lower, v) <= 0).collect(Collectors.toList()); diff --git a/api/src/main/java/org/apache/iceberg/types/Comparators.java b/api/src/main/java/org/apache/iceberg/types/Comparators.java index 79aa36165bd3..1c0e402c8173 100644 --- a/api/src/main/java/org/apache/iceberg/types/Comparators.java +++ b/api/src/main/java/org/apache/iceberg/types/Comparators.java @@ -251,6 +251,24 @@ public static Comparator signedUUIDs() { return Comparator.naturalOrder(); } + /** + * Returns the given comparator for a type or the signed UUID comparator, conditionally. + * + * @param type the Iceberg type + * @param defaultComparator the default comparator to use for non-UUID types or when signed UUID + * comparison is not requested + * @param useSignedUuid if true and the type is UUID, returns a signed UUID comparator + * @return the appropriate comparator + */ + @SuppressWarnings("unchecked") + public static Comparator comparatorFor( + Type type, Comparator defaultComparator, boolean useSignedUuid) { + if (useSignedUuid && type.typeId() == Type.TypeID.UUID) { + return (Comparator) signedUUIDs(); + } + return defaultComparator; + } + private static class NullsFirst implements Comparator { private static final NullsFirst INSTANCE = new NullsFirst<>(); diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java index 4c9b960c1c0a..658e7e359711 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java @@ -91,7 +91,7 @@ public boolean shouldRead(MessageType fileSchema, BlockMetaData rowGroup) { // Try again with signed UUID comparator. There is no quick way to detect // which comparator was used when the file's column metrics were written. // The signedUuidExpr has literals with signed comparators for lt/gt/eq predicates. - // For IN predicates, signed comparator is used via MetricsEvalVisitor#comparatorForIn. + // For IN predicates, signed comparator is used via Comparators#comparatorFor. return new MetricsEvalVisitor().eval(fileSchema, rowGroup, true); } @@ -130,21 +130,6 @@ private boolean eval(MessageType fileSchema, BlockMetaData rowGroup, boolean sig return ExpressionVisitors.visitEvaluator(signedUuidMode ? signedUuidExpr : expr, this); } - /** - * Returns the appropriate comparator for the given reference. This is needed for the IN - * predicate because the comparator information is lost when binding converts literals to a Set - * of raw values. For other predicates, the literal carries the comparator. - */ - @SuppressWarnings("unchecked") - private Comparator comparatorForIn(BoundReference ref) { - if (useSignedUuidComparator - && ref.type().typeId() == org.apache.iceberg.types.Type.TypeID.UUID) { - return (Comparator) Comparators.signedUUIDs(); - } - - return ref.comparator(); - } - @Override public Boolean alwaysTrue() { return ROWS_MIGHT_MATCH; // all rows match @@ -448,7 +433,9 @@ public Boolean in(BoundReference ref, Set literalSet) { return ROWS_MIGHT_MATCH; } - Comparator cmp = comparatorForIn(ref); + // use an appropriate comparator, for UUIDs use the signed comparator if requested + Comparator cmp = + Comparators.comparatorFor(ref.type(), ref.comparator(), useSignedUuidComparator); T lower = min(colStats, id); literals = literals.stream().filter(v -> cmp.compare(lower, v) <= 0).collect(Collectors.toList()); From b625b157c8b3a4de8c832d6ab9a46ee9092aeb78 Mon Sep 17 00:00:00 2001 From: bodduv Date: Fri, 23 Jan 2026 09:27:56 +0100 Subject: [PATCH 06/11] Fix iff; add new lines after block; fix TestInclusiveManifestEvaluator conflicts --- .../apache/iceberg/expressions/ExpressionUtil.java | 3 +-- .../expressions/InclusiveMetricsEvaluator.java | 3 ++- .../iceberg/expressions/ManifestEvaluator.java | 3 ++- .../java/org/apache/iceberg/types/Comparators.java | 1 + .../TestInclusiveManifestEvaluator.java | 14 ++++++++++++++ .../parquet/ParquetMetricsRowGroupFilter.java | 3 ++- 6 files changed, 22 insertions(+), 5 deletions(-) diff --git a/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java b/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java index ccd8182c8c27..c7959a96b6e5 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java +++ b/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java @@ -757,8 +757,7 @@ private static PartitionSpec identitySpec(Schema schema, int... ids) { * literals to a Set of raw values, so the evaluator must handle this separately. * * @param expr an unbound expression - * @return Optional containing the transformed expression if UUID bounds predicates exist, or - * empty if no UUID bounds predicates were found + * @return the expression with signed UUID comparators, or null if no UUID predicates are present */ @Nullable public static Expression toSignedUUIDLiteral(Expression expr) { diff --git a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java index b0977e90d6b3..ddba77fa8606 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java @@ -70,7 +70,8 @@ public InclusiveMetricsEvaluator(Schema schema, Expression unbound, boolean case Expression rewritten = rewriteNot(unbound); this.expr = Binder.bind(struct, rewritten, caseSensitive); - // Create the signed UUID expression iff there are UUID predicates that compare against bounds. + // Create the signed UUID expression if and only if there are UUID predicates + // that compare against bounds. Expression transformed = ExpressionUtil.toSignedUUIDLiteral(rewritten); this.signedUuidExpr = transformed != null ? Binder.bind(struct, transformed, caseSensitive) : null; diff --git a/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java index f7c797ddcc33..4fa1104806bb 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java @@ -72,7 +72,8 @@ private ManifestEvaluator(PartitionSpec spec, Expression partitionFilter, boolea Expression rewritten = rewriteNot(partitionFilter); this.expr = Binder.bind(partitionType, rewritten, caseSensitive); - // Create the signed UUID expression iff there are UUID predicates that compare against bounds. + // Create the signed UUID expression if and only if there are UUID predicates + // that compare against bounds. Expression transformed = ExpressionUtil.toSignedUUIDLiteral(rewritten); this.signedUuidExpr = transformed != null ? Binder.bind(partitionType, transformed, caseSensitive) : null; diff --git a/api/src/main/java/org/apache/iceberg/types/Comparators.java b/api/src/main/java/org/apache/iceberg/types/Comparators.java index 1c0e402c8173..a1821eee7332 100644 --- a/api/src/main/java/org/apache/iceberg/types/Comparators.java +++ b/api/src/main/java/org/apache/iceberg/types/Comparators.java @@ -266,6 +266,7 @@ public static Comparator comparatorFor( if (useSignedUuid && type.typeId() == Type.TypeID.UUID) { return (Comparator) signedUUIDs(); } + return defaultComparator; } diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java index 3be7f60d4606..d40fe8d25444 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java @@ -1101,6 +1101,20 @@ public void testUuidIn() { new TestHelpers.TestFieldSummary(true, null, null), new TestHelpers.TestFieldSummary(true, STRING_MIN, STRING_MIN), new TestHelpers.TestFieldSummary(false, STRING_MIN, STRING_MIN), + new TestHelpers.TestFieldSummary( + false, + true, + toByteBuffer(Types.FloatType.get(), 5.0F), + toByteBuffer(Types.FloatType.get(), 5.0F)), + new TestHelpers.TestFieldSummary( + false, + toByteBuffer(Types.FloatType.get(), 5.0F), + toByteBuffer(Types.FloatType.get(), 5.0F)), + new TestHelpers.TestFieldSummary( + false, + false, + toByteBuffer(Types.FloatType.get(), 5.0F), + toByteBuffer(Types.FloatType.get(), 5.0F)), new TestHelpers.TestFieldSummary( false, toByteBuffer(Types.UUIDType.get(), LEGACY_UUID_MIN), diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java index 658e7e359711..cbda6d1fdc18 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java @@ -66,7 +66,8 @@ public ParquetMetricsRowGroupFilter(Schema schema, Expression unbound, boolean c Expression rewritten = Expressions.rewriteNot(unbound); this.expr = Binder.bind(struct, rewritten, caseSensitive); - // Create the signed UUID expression iff there are UUID predicates that compare against bounds. + // Create the signed UUID expression if and only if there are UUID predicates + // that compare against bounds. Expression transformed = ExpressionUtil.toSignedUUIDLiteral(rewritten); this.signedUuidExpr = transformed != null ? Binder.bind(struct, transformed, caseSensitive) : null; From ab000b3dc365f33573763008a21c0671c3ffa5b0 Mon Sep 17 00:00:00 2001 From: bodduv Date: Fri, 30 Jan 2026 16:20:00 +0100 Subject: [PATCH 07/11] Address comments for test files --- .../expressions/TestExpressionUtil.java | 126 ++++++++++-------- .../TestStrictMetricsEvaluator.java | 94 ++++++------- .../data/TestMetricsRowGroupFilter.java | 22 +-- 3 files changed, 132 insertions(+), 110 deletions(-) diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java index 5c21885dd0d2..7ef47ead3bab 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java @@ -45,6 +45,7 @@ import org.apache.iceberg.variants.VariantPrimitive; import org.apache.iceberg.variants.VariantTestUtil; import org.apache.iceberg.variants.VariantValue; +import org.assertj.core.api.InstanceOfAssertFactories; import org.junit.jupiter.api.Test; public class TestExpressionUtil { @@ -1355,73 +1356,68 @@ private void assertEquals(UnboundTransform expected, UnboundTransform predicate = (UnboundPredicate) result; + assertThat(predicate.literal().value()).isEqualTo(testUuid); + // The literal should now use the signed comparator + assertThat(predicate.literal()) + .isInstanceOf(Literals.UUIDLiteral.class) + .asInstanceOf(InstanceOfAssertFactories.type(Literals.UUIDLiteral.class)) + .extracting("useSignedComparator") + .isEqualTo(true); } @Test - public void testToSignedUUIDLiteralDetectsInCompoundExpression() { - Expression original = - Expressions.and( - Expressions.equal("id", 42L), Expressions.greaterThan("uuid_col", UUID.randomUUID())); + public void testToSignedUUIDLiteralTransformsLtPredicate() { + UUID testUuid = UUID.randomUUID(); + Expression original = Expressions.lessThan("uuid_col", testUuid); - assertThat(ExpressionUtil.toSignedUUIDLiteral(original)) - .as("Should detect UUID predicate in compound expression") - .isNotNull(); + Expression result = ExpressionUtil.toSignedUUIDLiteral(original); + assertThat(result).isNotNull().isInstanceOf(UnboundPredicate.class); + UnboundPredicate predicate = (UnboundPredicate) result; + assertThat(predicate.literal()) + .isInstanceOf(Literals.UUIDLiteral.class) + .asInstanceOf(InstanceOfAssertFactories.type(Literals.UUIDLiteral.class)) + .extracting("useSignedComparator") + .isEqualTo(true); } @Test - public void testToSignedUUIDLiteralTransformsLiteral() { + public void testToSignedUUIDLiteralTransformsGtPredicate() { UUID testUuid = UUID.randomUUID(); - Expression original = Expressions.equal("uuid_col", testUuid); + Expression original = Expressions.greaterThan("uuid_col", testUuid); Expression result = ExpressionUtil.toSignedUUIDLiteral(original); - assertThat(result).isNotNull(); - assertThat(result).isInstanceOf(UnboundPredicate.class); - UnboundPredicate pred = (UnboundPredicate) result; - assertThat(pred.literal().value()).isEqualTo(testUuid); - // The literal should now use the signed comparator - assertThat(pred.literal()).isInstanceOf(Literals.UUIDLiteral.class); + assertThat(result).isNotNull().isInstanceOf(UnboundPredicate.class); + UnboundPredicate predicate = (UnboundPredicate) result; + assertThat(predicate.literal()) + .isInstanceOf(Literals.UUIDLiteral.class) + .asInstanceOf(InstanceOfAssertFactories.type(Literals.UUIDLiteral.class)) + .extracting("useSignedComparator") + .isEqualTo(true); } @Test @@ -1431,10 +1427,20 @@ public void testToSignedUUIDLiteralTransformsInPredicate() { Expression original = Expressions.in("uuid_col", uuid1, uuid2); Expression result = ExpressionUtil.toSignedUUIDLiteral(original); - assertThat(result).isNotNull(); - assertThat(result).isInstanceOf(UnboundPredicate.class); - UnboundPredicate pred = (UnboundPredicate) result; - assertThat(pred.literals()).hasSize(2); + assertThat(result).isNotNull().isInstanceOf(UnboundPredicate.class); + UnboundPredicate predicate = (UnboundPredicate) result; + assertThat(predicate.literals()).hasSize(2); + // All literals should use the signed comparator + predicate + .literals() + .forEach( + lit -> { + assertThat(lit) + .isInstanceOf(Literals.UUIDLiteral.class) + .asInstanceOf(InstanceOfAssertFactories.type(Literals.UUIDLiteral.class)) + .extracting("useSignedComparator") + .isEqualTo(true); + }); } @Test @@ -1445,11 +1451,23 @@ public void testToSignedUUIDLiteralTransformsCompoundExpression() { Expressions.equal("id", 42L), Expressions.greaterThan("uuid_col", testUuid)); Expression result = ExpressionUtil.toSignedUUIDLiteral(original); - assertThat(result).isNotNull(); - assertThat(result).isInstanceOf(And.class); + assertThat(result).isNotNull().isInstanceOf(And.class); And and = (And) result; - // Both children should be transformed + + // Verify the non-UUID predicate is unchanged assertThat(and.left()).isInstanceOf(UnboundPredicate.class); + UnboundPredicate leftPredicate = (UnboundPredicate) and.left(); + assertThat(leftPredicate.literal()) + .isInstanceOf(Literals.LongLiteral.class) + .extracting("value") + .isEqualTo(42L); + + // Verify the UUID predicate is transformed with signed comparator assertThat(and.right()).isInstanceOf(UnboundPredicate.class); + UnboundPredicate rightPredicate = (UnboundPredicate) and.right(); + assertThat(rightPredicate.literal()) + .isInstanceOf(Literals.UUIDLiteral.class) + .extracting("useSignedComparator") + .isEqualTo(true); } } diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java index 02ef5662b261..5508fb173aaa 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java @@ -174,6 +174,53 @@ public class TestStrictMetricsEvaluator { // upper bounds ImmutableMap.of(5, toByteBuffer(StringType.get(), "bbb"))); + // UUIDs for testing dual-comparator behavior with high-bit values + // These UUIDs span the signed/unsigned comparison boundary + private static final UUID UUID_MIN = UUID.fromString("00000000-0000-0000-0000-000000000001"); + private static final UUID UUID_MAX = UUID.fromString("80000000-0000-0000-0000-000000000001"); + + private static final DataFile UUID_FILE = + new TestDataFile( + "uuid_file.avro", + Row.of(), + 50, + ImmutableMap.of(18, 50L), + ImmutableMap.of(18, 0L), + null, + ImmutableMap.of(18, toByteBuffer(Types.UUIDType.get(), UUID_MIN)), + ImmutableMap.of(18, toByteBuffer(Types.UUIDType.get(), UUID_MAX))); + + // File with single UUID value (lower == upper) + private static final UUID SINGLE_UUID = UUID.fromString("40000000-0000-0000-0000-000000000001"); + private static final DataFile SINGLE_UUID_FILE = + new TestDataFile( + "single_uuid_file.avro", + Row.of(), + 50, + ImmutableMap.of(18, 50L), + ImmutableMap.of(18, 0L), + null, + ImmutableMap.of(18, toByteBuffer(Types.UUIDType.get(), SINGLE_UUID)), + ImmutableMap.of(18, toByteBuffer(Types.UUIDType.get(), SINGLE_UUID))); + + // File with inverted UUID bounds (as would be written by legacy signed comparator) + // In RFC unsigned order: 0x40... < 0x80..., so lower > upper when interpreted with RFC + private static final UUID LEGACY_UUID_LOWER = + UUID.fromString("80000000-0000-0000-0000-000000000001"); + private static final UUID LEGACY_UUID_UPPER = + UUID.fromString("40000000-0000-0000-0000-000000000001"); + + private static final DataFile LEGACY_UUID_FILE = + new TestDataFile( + "legacy_uuid_file.avro", + Row.of(), + 50, + ImmutableMap.of(18, 50L), + ImmutableMap.of(18, 0L), + null, + ImmutableMap.of(18, toByteBuffer(Types.UUIDType.get(), LEGACY_UUID_LOWER)), + ImmutableMap.of(18, toByteBuffer(Types.UUIDType.get(), LEGACY_UUID_UPPER))); + @Test public void testAllNulls() { boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, notNull("all_nulls")).eval(FILE); @@ -689,20 +736,6 @@ SCHEMA, lessThanOrEqual("struct.nested_col_with_stats", INT_MAX_VALUE)) // Tests for UUID with StrictMetricsEvaluator using RFC-compliant comparison only - private static final UUID UUID_MIN = UUID.fromString("00000000-0000-0000-0000-000000000001"); - private static final UUID UUID_MAX = UUID.fromString("80000000-0000-0000-0000-000000000001"); - - private static final DataFile UUID_FILE = - new TestDataFile( - "uuid_file.avro", - Row.of(), - 50, - ImmutableMap.of(18, 50L), - ImmutableMap.of(18, 0L), - null, - ImmutableMap.of(18, toByteBuffer(Types.UUIDType.get(), UUID_MIN)), - ImmutableMap.of(18, toByteBuffer(Types.UUIDType.get(), UUID_MAX))); - @Test public void testStrictUuidGt() { // Query: uuid > 0x00... (all UUIDs in file should be > this) @@ -741,19 +774,6 @@ public void testStrictUuidInNeverMatchesRange() { assertThat(allMatch).as("Strict IN should not match range").isFalse(); } - // File with single UUID value (lower == upper) - private static final UUID SINGLE_UUID = UUID.fromString("40000000-0000-0000-0000-000000000001"); - private static final DataFile SINGLE_UUID_FILE = - new TestDataFile( - "single_uuid_file.avro", - Row.of(), - 50, - ImmutableMap.of(18, 50L), - ImmutableMap.of(18, 0L), - null, - ImmutableMap.of(18, toByteBuffer(Types.UUIDType.get(), SINGLE_UUID)), - ImmutableMap.of(18, toByteBuffer(Types.UUIDType.get(), SINGLE_UUID))); - @Test public void testStrictUuidInMatchesSingleValue() { // Strict IN should match when lower == upper and the value is in the set @@ -792,25 +812,9 @@ public void testStrictUuidNotInDoesNotMatchWhenValueInBounds() { } // Tests for file with inverted UUID bounds (as would be written by legacy signed comparator) - // In RFC unsigned order: 0x40... < 0x80..., so lower > upper when interpreted with RFC - private static final UUID LEGACY_UUID_LOWER = - UUID.fromString("80000000-0000-0000-0000-000000000001"); - private static final UUID LEGACY_UUID_UPPER = - UUID.fromString("40000000-0000-0000-0000-000000000001"); - - private static final DataFile LEGACY_UUID_FILE = - new TestDataFile( - "legacy_uuid_file.avro", - Row.of(), - 50, - ImmutableMap.of(18, 50L), - ImmutableMap.of(18, 0L), - null, - ImmutableMap.of(18, toByteBuffer(Types.UUIDType.get(), LEGACY_UUID_LOWER)), - ImmutableMap.of(18, toByteBuffer(Types.UUIDType.get(), LEGACY_UUID_UPPER))); @Test - public void testStrictUuidInWithInvertedBounds() { + public void testStrictUuidInWithLegacyInvertedBounds() { // With inverted bounds [0x80..., 0x40...] where lower > upper in RFC order, // strict IN should never match since lower != upper UUID uuid1 = UUID.fromString("20000000-0000-0000-0000-000000000001"); @@ -821,7 +825,7 @@ public void testStrictUuidInWithInvertedBounds() { } @Test - public void testStrictUuidNotInWithInvertedBounds() { + public void testStrictUuidNotInWithLegacyInvertedBounds() { // Query: NOT IN (0x50..., 0x60...) // With inverted bounds [0x80..., 0x40...], in RFC order: 0x40... < 0x50... < 0x60... < 0x80... // The values 0x50... and 0x60... are between upper (0x40...) and lower (0x80...) in RFC order diff --git a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java index 284cd980f1a4..9437911c8902 100644 --- a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java +++ b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java @@ -162,6 +162,17 @@ public static List parameters() { private static final UUID UUID_WITH_ZEROS = UUID.fromString("00000000-0000-0000-0000-000000000000"); + // UUIDs for testing dual-comparator behavior with high-bit values + // These UUIDs span the signed/unsigned comparison boundary + private static final UUID UUID_LOW = UUID.fromString("00000000-0000-0000-0000-000000000001"); + private static final UUID UUID_MID = UUID.fromString("40000000-0000-0000-0000-000000000001"); + private static final UUID UUID_HIGH = UUID.fromString("80000000-0000-0000-0000-000000000001"); + private static final UUID UUID_HIGHER = UUID.fromString("c0000000-0000-0000-0000-000000000001"); + + private static final Schema UUID_SCHEMA = + new Schema( + required(1, "id", IntegerType.get()), optional(2, "uuid_col", Types.UUIDType.get())); + private File orcFile = null; private MessageType parquetSchema = null; private BlockMetaData rowGroupMetadata = null; @@ -1138,17 +1149,6 @@ public void testUUID() { .isTrue(); } - // UUIDs for testing dual-comparator behavior with high-bit values - // These UUIDs span the signed/unsigned comparison boundary - private static final UUID UUID_LOW = UUID.fromString("00000000-0000-0000-0000-000000000001"); - private static final UUID UUID_MID = UUID.fromString("40000000-0000-0000-0000-000000000001"); - private static final UUID UUID_HIGH = UUID.fromString("80000000-0000-0000-0000-000000000001"); - private static final UUID UUID_HIGHER = UUID.fromString("c0000000-0000-0000-0000-000000000001"); - - private static final Schema UUID_SCHEMA = - new Schema( - required(1, "id", IntegerType.get()), optional(2, "uuid_col", Types.UUIDType.get())); - /** * Tests UUID filtering with values that span the signed/unsigned comparison boundary. In RFC 9562 * unsigned comparison: UUID_LOW < UUID_MID < UUID_HIGH < UUID_HIGHER In legacy signed comparison: From e831236c04c4856576edcf738515cc9a503fecc3 Mon Sep 17 00:00:00 2001 From: bodduv Date: Tue, 3 Feb 2026 12:01:22 +0100 Subject: [PATCH 08/11] Drop transient cached cmp --- .../java/org/apache/iceberg/expressions/Literals.java | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/api/src/main/java/org/apache/iceberg/expressions/Literals.java b/api/src/main/java/org/apache/iceberg/expressions/Literals.java index 82a69019afc1..02b1a71667b2 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/Literals.java +++ b/api/src/main/java/org/apache/iceberg/expressions/Literals.java @@ -616,8 +616,6 @@ static class UUIDLiteral extends BaseLiteral { // Flag to indicate which comparator to use (serializable) private final boolean useSignedComparator; - // Transient cached comparator (reconstructed on deserialization) - private transient volatile Comparator cmp; UUIDLiteral(UUID value) { this(value, false); @@ -639,11 +637,7 @@ public Literal to(Type type) { @Override public Comparator comparator() { - if (cmp == null) { - cmp = useSignedComparator ? SIGNED_CMP : RFC_CMP; - } - - return cmp; + return useSignedComparator ? SIGNED_CMP : RFC_CMP; } @Override From e428981ec821538ed594008acb83b2305944c303 Mon Sep 17 00:00:00 2001 From: bodduv Date: Fri, 6 Feb 2026 09:53:06 +0100 Subject: [PATCH 09/11] Refactor ExpressionUtil.predicate --- .../iceberg/expressions/ExpressionUtil.java | 32 ++++--------------- 1 file changed, 6 insertions(+), 26 deletions(-) diff --git a/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java b/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java index c7959a96b6e5..80dd4466f629 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java +++ b/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java @@ -814,34 +814,19 @@ public Expression predicate(BoundPredicate pred) { @Override @SuppressWarnings("unchecked") public Expression predicate(UnboundPredicate pred) { - UnboundTerm term = pred.term(); - switch (pred.op()) { - case IS_NULL: - case NOT_NULL: - case IS_NAN: - case NOT_NAN: - // Unary predicates don't have literals to transform - return pred; - case LT: case LT_EQ: case GT: case GT_EQ: case EQ: case NOT_EQ: - Literal lit = pred.literal(); - if (lit.value() instanceof UUID) { + if (pred.literal().value() instanceof UUID) { foundUUIDBoundsPredicate = true; - Literals.UUIDLiteral uuidLit = (Literals.UUIDLiteral) lit; - return new UnboundPredicate<>(pred.op(), term, (T) uuidLit.withSignedComparator()); + Literals.UUIDLiteral uuidLit = (Literals.UUIDLiteral) pred.literal(); + return new UnboundPredicate<>( + pred.op(), pred.term(), (T) uuidLit.withSignedComparator()); } - - return pred; - - case STARTS_WITH: - case NOT_STARTS_WITH: - // These operations don't apply to UUIDs, no transformation needed return pred; case IN: @@ -851,15 +836,10 @@ public Expression predicate(UnboundPredicate pred) { foundUUIDBoundsPredicate = true; List transformedValues = literals.stream() - .map( - l -> { - Literals.UUIDLiteral uuidLit = (Literals.UUIDLiteral) l; - return (T) uuidLit.withSignedComparator(); - }) + .map(l -> (T) ((Literals.UUIDLiteral) l).withSignedComparator()) .collect(Collectors.toList()); - return new UnboundPredicate<>(pred.op(), term, transformedValues); + return new UnboundPredicate<>(pred.op(), pred.term(), transformedValues); } - return pred; default: From 8c9552094c9c6abd75017ee5eb85fcfc88c0967d Mon Sep 17 00:00:00 2001 From: bodduv Date: Wed, 11 Feb 2026 11:41:19 +0100 Subject: [PATCH 10/11] Make MetricsEvalVisitor static; address doc related comments --- .../expressions/InclusiveMetricsEvaluator.java | 12 ++++++------ .../org/apache/iceberg/expressions/Literals.java | 7 +++++-- .../iceberg/expressions/ManifestEvaluator.java | 12 ++++++------ .../parquet/ParquetMetricsRowGroupFilter.java | 2 +- 4 files changed, 18 insertions(+), 15 deletions(-) diff --git a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java index ddba77fa8606..5a6c52df4e49 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java @@ -85,7 +85,7 @@ public InclusiveMetricsEvaluator(Schema schema, Expression unbound, boolean case */ public boolean eval(ContentFile file) { // TODO: detect the case where a column is missing from the file using file's max field id. - boolean result = new MetricsEvalVisitor().eval(file, false); + boolean result = new MetricsEvalVisitor().eval(file, expr, false); // If the RFC-compliant evaluation says rows might match, or there's no signed UUID expression, // return the result. @@ -94,14 +94,14 @@ public boolean eval(ContentFile file) { } // Always try with signed UUID comparator as a fallback. There is no reliable way to detect - // which comparator was used when the file's column metrics were written. - return new MetricsEvalVisitor().eval(file, true); + // whether signed or unsigned comparator was used when the UUID column stats were written. + return new MetricsEvalVisitor().eval(file, signedUuidExpr, true); } private static final boolean ROWS_MIGHT_MATCH = true; private static final boolean ROWS_CANNOT_MATCH = false; - private class MetricsEvalVisitor extends ExpressionVisitors.BoundVisitor { + private static class MetricsEvalVisitor extends ExpressionVisitors.BoundVisitor { private Map valueCounts = null; private Map nullCounts = null; private Map nanCounts = null; @@ -112,7 +112,7 @@ private class MetricsEvalVisitor extends ExpressionVisitors.BoundVisitor of raw values. private boolean useSignedUuidComparator = false; - private boolean eval(ContentFile file, boolean signedUuidMode) { + private boolean eval(ContentFile file, Expression expression, boolean signedUuidMode) { if (file.recordCount() == 0) { return ROWS_CANNOT_MATCH; } @@ -131,7 +131,7 @@ private boolean eval(ContentFile file, boolean signedUuidMode) { this.upperBounds = file.upperBounds(); this.useSignedUuidComparator = signedUuidMode; - return ExpressionVisitors.visitEvaluator(signedUuidMode ? signedUuidExpr : expr, this); + return ExpressionVisitors.visitEvaluator(expression, this); } @Override diff --git a/api/src/main/java/org/apache/iceberg/expressions/Literals.java b/api/src/main/java/org/apache/iceberg/expressions/Literals.java index 02b1a71667b2..a1d8a39772bd 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/Literals.java +++ b/api/src/main/java/org/apache/iceberg/expressions/Literals.java @@ -609,8 +609,11 @@ public String toString() { } static class UUIDLiteral extends BaseLiteral { - private static final Comparator RFC_CMP = + // unsigned byte-wise comparator per RFC 4122/9562 specification + private static final Comparator UNSIGNED_CMP = Comparators.nullsFirst().thenComparing(Comparators.uuids()); + // signed comparator for backward compatibility with files written before the introduction of + // RFC-compliant comparison private static final Comparator SIGNED_CMP = Comparators.nullsFirst().thenComparing(Comparators.signedUUIDs()); @@ -637,7 +640,7 @@ public Literal to(Type type) { @Override public Comparator comparator() { - return useSignedComparator ? SIGNED_CMP : RFC_CMP; + return useSignedComparator ? SIGNED_CMP : UNSIGNED_CMP; } @Override diff --git a/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java index 4fa1104806bb..bd25b206c838 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java @@ -86,7 +86,7 @@ private ManifestEvaluator(PartitionSpec spec, Expression partitionFilter, boolea * @return false if the file cannot contain rows that match the expression, true otherwise. */ public boolean eval(ManifestFile manifest) { - boolean result = new ManifestEvalVisitor().eval(manifest, false); + boolean result = new ManifestEvalVisitor().eval(manifest, expr, false); // If the RFC-compliant evaluation says rows might match, or there's no signed UUID expression, // return the result. @@ -95,28 +95,28 @@ public boolean eval(ManifestFile manifest) { } // Always try with signed UUID comparator as a fallback. There is no reliable way to detect - // which comparator was used when the manifest's partition field summaries were written. - return new ManifestEvalVisitor().eval(manifest, true); + // whether signed or unsigned comparator was used when the UUID column stats were written. + return new ManifestEvalVisitor().eval(manifest, signedUuidExpr, true); } private static final boolean ROWS_MIGHT_MATCH = true; private static final boolean ROWS_CANNOT_MATCH = false; - private class ManifestEvalVisitor extends BoundExpressionVisitor { + private static class ManifestEvalVisitor extends BoundExpressionVisitor { private List stats = null; // Flag to use signed UUID comparator for backward compatibility. // This is needed for the IN predicate because the comparator information is lost // when binding converts literals to a Set of raw values. private boolean useSignedUuidComparator = false; - private boolean eval(ManifestFile manifest, boolean signedUuidMode) { + private boolean eval(ManifestFile manifest, Expression expression, boolean signedUuidMode) { this.stats = manifest.partitions(); this.useSignedUuidComparator = signedUuidMode; if (stats == null) { return ROWS_MIGHT_MATCH; } - return ExpressionVisitors.visitEvaluator(signedUuidMode ? signedUuidExpr : expr, this); + return ExpressionVisitors.visitEvaluator(expression, this); } @Override diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java index cbda6d1fdc18..b1ee2d46f135 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java @@ -90,7 +90,7 @@ public boolean shouldRead(MessageType fileSchema, BlockMetaData rowGroup) { } // Try again with signed UUID comparator. There is no quick way to detect - // which comparator was used when the file's column metrics were written. + // whether signed or unsigned comparator was used when the UUID column stats were written. // The signedUuidExpr has literals with signed comparators for lt/gt/eq predicates. // For IN predicates, signed comparator is used via Comparators#comparatorFor. return new MetricsEvalVisitor().eval(fileSchema, rowGroup, true); From e921039f78214b3133ecc076d45c0720f515a003 Mon Sep 17 00:00:00 2001 From: bodduv Date: Tue, 17 Feb 2026 13:20:16 +0100 Subject: [PATCH 11/11] Address comments; Bring all three metrics evaluators in sync; improve tests --- .../expressions/ManifestEvaluator.java | 4 +- .../expressions/TestExpressionUtil.java | 46 +++++ .../TestInclusiveManifestEvaluator.java | 164 ++++++++---------- .../TestInclusiveMetricsEvaluator.java | 89 ++++------ .../TestStrictMetricsEvaluator.java | 47 ++++- .../apache/iceberg/types/TestComparators.java | 43 ++++- .../data/TestMetricsRowGroupFilter.java | 126 +++++++++++++- .../parquet/ParquetMetricsRowGroupFilter.java | 23 ++- 8 files changed, 384 insertions(+), 158 deletions(-) diff --git a/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java index bd25b206c838..1a213e7a666d 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java @@ -105,8 +105,8 @@ public boolean eval(ManifestFile manifest) { private static class ManifestEvalVisitor extends BoundExpressionVisitor { private List stats = null; // Flag to use signed UUID comparator for backward compatibility. - // This is needed for the IN predicate because the comparator information is lost - // when binding converts literals to a Set of raw values. + // This is specifically necessary for IN predicates because the comparator information + // is lost when binding converts literals to a Set of raw values. private boolean useSignedUuidComparator = false; private boolean eval(ManifestFile manifest, Expression expression, boolean signedUuidMode) { diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java index 7ef47ead3bab..6373e73a12ec 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java @@ -1378,6 +1378,14 @@ public void testToSignedUUIDLiteralTransformsEqPredicate() { UUID testUuid = UUID.randomUUID(); Expression original = Expressions.equal("uuid_col", testUuid); + // Verify the original literal uses unsigned comparator (useSignedComparator = false) + UnboundPredicate originalPredicate = (UnboundPredicate) original; + assertThat(originalPredicate.literal()) + .isInstanceOf(Literals.UUIDLiteral.class) + .asInstanceOf(InstanceOfAssertFactories.type(Literals.UUIDLiteral.class)) + .extracting("useSignedComparator") + .isEqualTo(false); + Expression result = ExpressionUtil.toSignedUUIDLiteral(original); assertThat(result).isNotNull().isInstanceOf(UnboundPredicate.class); UnboundPredicate predicate = (UnboundPredicate) result; @@ -1395,6 +1403,14 @@ public void testToSignedUUIDLiteralTransformsLtPredicate() { UUID testUuid = UUID.randomUUID(); Expression original = Expressions.lessThan("uuid_col", testUuid); + // Verify the original literal uses unsigned comparator (useSignedComparator = false) + UnboundPredicate originalPredicate = (UnboundPredicate) original; + assertThat(originalPredicate.literal()) + .isInstanceOf(Literals.UUIDLiteral.class) + .asInstanceOf(InstanceOfAssertFactories.type(Literals.UUIDLiteral.class)) + .extracting("useSignedComparator") + .isEqualTo(false); + Expression result = ExpressionUtil.toSignedUUIDLiteral(original); assertThat(result).isNotNull().isInstanceOf(UnboundPredicate.class); UnboundPredicate predicate = (UnboundPredicate) result; @@ -1410,6 +1426,14 @@ public void testToSignedUUIDLiteralTransformsGtPredicate() { UUID testUuid = UUID.randomUUID(); Expression original = Expressions.greaterThan("uuid_col", testUuid); + // Verify the original literal uses unsigned comparator (useSignedComparator = false) + UnboundPredicate originalPredicate = (UnboundPredicate) original; + assertThat(originalPredicate.literal()) + .isInstanceOf(Literals.UUIDLiteral.class) + .asInstanceOf(InstanceOfAssertFactories.type(Literals.UUIDLiteral.class)) + .extracting("useSignedComparator") + .isEqualTo(false); + Expression result = ExpressionUtil.toSignedUUIDLiteral(original); assertThat(result).isNotNull().isInstanceOf(UnboundPredicate.class); UnboundPredicate predicate = (UnboundPredicate) result; @@ -1426,6 +1450,19 @@ public void testToSignedUUIDLiteralTransformsInPredicate() { UUID uuid2 = UUID.randomUUID(); Expression original = Expressions.in("uuid_col", uuid1, uuid2); + // Verify the original literals use unsigned comparator (useSignedComparator = false) + UnboundPredicate originalPredicate = (UnboundPredicate) original; + originalPredicate + .literals() + .forEach( + lit -> { + assertThat(lit) + .isInstanceOf(Literals.UUIDLiteral.class) + .asInstanceOf(InstanceOfAssertFactories.type(Literals.UUIDLiteral.class)) + .extracting("useSignedComparator") + .isEqualTo(false); + }); + Expression result = ExpressionUtil.toSignedUUIDLiteral(original); assertThat(result).isNotNull().isInstanceOf(UnboundPredicate.class); UnboundPredicate predicate = (UnboundPredicate) result; @@ -1450,6 +1487,15 @@ public void testToSignedUUIDLiteralTransformsCompoundExpression() { Expressions.and( Expressions.equal("id", 42L), Expressions.greaterThan("uuid_col", testUuid)); + // Verify the original UUID literal uses unsigned comparator (useSignedComparator = false) + And originalAnd = (And) original; + UnboundPredicate originalUuidPredicate = (UnboundPredicate) originalAnd.right(); + assertThat(originalUuidPredicate.literal()) + .isInstanceOf(Literals.UUIDLiteral.class) + .asInstanceOf(InstanceOfAssertFactories.type(Literals.UUIDLiteral.class)) + .extracting("useSignedComparator") + .isEqualTo(false); + Expression result = ExpressionUtil.toSignedUUIDLiteral(original); assertThat(result).isNotNull().isInstanceOf(And.class); And and = (And) result; diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java index d40fe8d25444..f83b87d61aae 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java @@ -116,6 +116,63 @@ public class TestInclusiveManifestEvaluator { private static final ByteBuffer UUID_MIN = toByteBuffer(Types.UUIDType.get(), UUID_MIN_VALUE); private static final ByteBuffer UUID_MAX = toByteBuffer(Types.UUIDType.get(), UUID_MAX_VALUE); + // Legacy manifest with "inverted" bounds (min > max in unsigned order) + // Used for testing backward compatibility with files written using signed UUID comparator + private static final UUID LEGACY_UUID_MIN = + UUID.fromString("80000000-0000-0000-0000-000000000001"); + private static final UUID LEGACY_UUID_MAX = + UUID.fromString("40000000-0000-0000-0000-000000000001"); + + private static final ManifestFile LEGACY_UUID_MANIFEST = + new TestHelpers.TestManifestFile( + "legacy-uuid-manifest.avro", + 1024, + 0, + System.currentTimeMillis(), + 5, + 10, + 0, + ImmutableList.of( + new TestHelpers.TestFieldSummary(false, INT_MIN, INT_MAX), + new TestHelpers.TestFieldSummary(true, null, null), + new TestHelpers.TestFieldSummary(true, STRING_MIN, STRING_MAX), + new TestHelpers.TestFieldSummary(false, STRING_MIN, STRING_MAX), + new TestHelpers.TestFieldSummary( + false, + toByteBuffer(Types.FloatType.get(), 0F), + toByteBuffer(Types.FloatType.get(), 20F)), + new TestHelpers.TestFieldSummary(true, null, null), + new TestHelpers.TestFieldSummary(true, false, null, null), + new TestHelpers.TestFieldSummary(false, true, null, null), + new TestHelpers.TestFieldSummary(true, true, null, null), + new TestHelpers.TestFieldSummary( + false, + false, + toByteBuffer(Types.FloatType.get(), 0F), + toByteBuffer(Types.FloatType.get(), 20F)), + new TestHelpers.TestFieldSummary(true, null, null), + new TestHelpers.TestFieldSummary(true, STRING_MIN, STRING_MIN), + new TestHelpers.TestFieldSummary(false, STRING_MIN, STRING_MIN), + new TestHelpers.TestFieldSummary( + false, + true, + toByteBuffer(Types.FloatType.get(), 5.0F), + toByteBuffer(Types.FloatType.get(), 5.0F)), + new TestHelpers.TestFieldSummary( + false, + toByteBuffer(Types.FloatType.get(), 5.0F), + toByteBuffer(Types.FloatType.get(), 5.0F)), + new TestHelpers.TestFieldSummary( + false, + false, + toByteBuffer(Types.FloatType.get(), 5.0F), + toByteBuffer(Types.FloatType.get(), 5.0F)), + new TestHelpers.TestFieldSummary( + false, + toByteBuffer(Types.UUIDType.get(), LEGACY_UUID_MIN), + toByteBuffer(Types.UUIDType.get(), LEGACY_UUID_MAX))), + null); + private static final ManifestFile NO_STATS = new TestHelpers.TestManifestFile( "manifest-list.avro", 1024, 0, System.currentTimeMillis(), null, null, null, null, null); @@ -969,7 +1026,9 @@ public void testUuidGt() { UUID justBelowMax = UUID.fromString("ffffffff-ffff-fffe-ffff-ffffffffffff"); shouldRead = ManifestEvaluator.forRowFilter(greaterThan("uuid", justBelowMax), SPEC, true).eval(FILE); - assertThat(shouldRead).as("Should read: one possible uuid").isTrue(); + assertThat(shouldRead) + .as("Should read: uuid between justBelowMax and the upper bound") + .isTrue(); // UUID_MAX_VALUE is the upper bound, so no rows can be > UUID_MAX_VALUE. // Both RFC and signed comparators agree on this since we're comparing the value to itself. @@ -1048,85 +1107,24 @@ public void testUuidIn() { assertThat(shouldRead).as("Should not read: uuids above upper bound").isFalse(); } - // Tests for legacy UUID file compatibility (files written with signed UUID comparator) + // Tests for legacy UUID file compatibility (files written with signed UUID comparator). // These tests simulate manifests where min/max bounds were computed using Java's signed // comparison. // - // Key insight: Java's UUID.compareTo() compares MSB first, then LSB, both as SIGNED longs. - // This means: - // - UUIDs starting with 0x00-0x7F are "positive" (MSB is positive as signed long) - // - UUIDs starting with 0x80-0xFF are "negative" (MSB is negative as signed long) + // Java's UUID.compareTo() compares MSB first, then LSB, both as SIGNED longs: + // - UUIDs starting with 0x00-0x7F have positive MSB (as signed long) + // - UUIDs starting with 0x80-0xFF have negative MSB (as signed long) // - // Example file containing UUIDs: 0x00..., 0x40..., 0x80... + // For a file containing UUIDs 0x00..., 0x40..., 0x80...: // - Unsigned (RFC) order: 0x00... < 0x40... < 0x80... // - Signed (Java) order: 0x80... < 0x00... < 0x40... // - // If written with signed comparator, the manifest would have: - // - min = 0x80... (smallest in signed order) - // - max = 0x40... (largest in signed order) - - // Legacy manifest with "inverted" bounds (min > max in unsigned order) - private static final UUID LEGACY_UUID_MIN = - UUID.fromString("80000000-0000-0000-0000-000000000001"); - private static final UUID LEGACY_UUID_MAX = - UUID.fromString("40000000-0000-0000-0000-000000000001"); - - private static final ManifestFile LEGACY_UUID_MANIFEST = - new TestHelpers.TestManifestFile( - "legacy-uuid-manifest.avro", - 1024, - 0, - System.currentTimeMillis(), - 5, - 10, - 0, - ImmutableList.of( - new TestHelpers.TestFieldSummary(false, INT_MIN, INT_MAX), - new TestHelpers.TestFieldSummary(true, null, null), - new TestHelpers.TestFieldSummary(true, STRING_MIN, STRING_MAX), - new TestHelpers.TestFieldSummary(false, STRING_MIN, STRING_MAX), - new TestHelpers.TestFieldSummary( - false, - toByteBuffer(Types.FloatType.get(), 0F), - toByteBuffer(Types.FloatType.get(), 20F)), - new TestHelpers.TestFieldSummary(true, null, null), - new TestHelpers.TestFieldSummary(true, false, null, null), - new TestHelpers.TestFieldSummary(false, true, null, null), - new TestHelpers.TestFieldSummary(true, true, null, null), - new TestHelpers.TestFieldSummary( - false, - false, - toByteBuffer(Types.FloatType.get(), 0F), - toByteBuffer(Types.FloatType.get(), 20F)), - new TestHelpers.TestFieldSummary(true, null, null), - new TestHelpers.TestFieldSummary(true, STRING_MIN, STRING_MIN), - new TestHelpers.TestFieldSummary(false, STRING_MIN, STRING_MIN), - new TestHelpers.TestFieldSummary( - false, - true, - toByteBuffer(Types.FloatType.get(), 5.0F), - toByteBuffer(Types.FloatType.get(), 5.0F)), - new TestHelpers.TestFieldSummary( - false, - toByteBuffer(Types.FloatType.get(), 5.0F), - toByteBuffer(Types.FloatType.get(), 5.0F)), - new TestHelpers.TestFieldSummary( - false, - false, - toByteBuffer(Types.FloatType.get(), 5.0F), - toByteBuffer(Types.FloatType.get(), 5.0F)), - new TestHelpers.TestFieldSummary( - false, - toByteBuffer(Types.UUIDType.get(), LEGACY_UUID_MIN), - toByteBuffer(Types.UUIDType.get(), LEGACY_UUID_MAX))), - null); + // With signed comparator, the manifest has min=0x80..., max=0x40... (inverted bounds). @Test public void testLegacyUuidManifestIn() { - // Query: uuid IN (0x20..., 0x30...) - // Both are between 0x80... and 0x40... in signed order - // RFC evaluation: Both values are < 0x80... (lower bound) in unsigned, so filtered out - // Legacy evaluation: Uses signed comparator, so 0x80... < 0x20... < 0x30... < 0x40... + // 0x20... and 0x30... are between 0x80... and 0x40... in signed order. + // RFC unsigned filters them out (both < 0x80...), but legacy signed includes them. UUID uuid1 = UUID.fromString("20000000-0000-0000-0000-000000000001"); UUID uuid2 = UUID.fromString("30000000-0000-0000-0000-000000000001"); boolean shouldRead = @@ -1139,20 +1137,14 @@ public void testLegacyUuidManifestIn() { @Test public void testLegacyUuidManifestInAllBelowLowerBound() { - // Query: uuid IN (0x50..., 0x60...) - // Both are > 0x40... (upper bound) in both unsigned and signed order - // RFC evaluation: Both values are < 0x80... (lower bound) in unsigned, so filtered out - // Legacy evaluation: Both values are > 0x40... (upper bound) in signed order, so filtered out - // So both comparators should filter these out + // 0x50... and 0x60... are outside bounds in both orderings: + // - RFC unsigned: both < 0x80... (lower bound) + // - Signed: both > 0x40... (upper bound) UUID uuid1 = UUID.fromString("50000000-0000-0000-0000-000000000001"); UUID uuid2 = UUID.fromString("60000000-0000-0000-0000-000000000001"); boolean shouldRead = ManifestEvaluator.forRowFilter(in("uuid", uuid1, uuid2), SPEC, true) .eval(LEGACY_UUID_MANIFEST); - // In signed order: 0x80... < 0x90... < ... < 0xFF... < 0x00... < ... < 0x40... < 0x50... < - // 0x60... - // So 0x50... and 0x60... are above upper bound [0x40...] in signed order - // In RFC order: 0x50... and 0x60... are below lower bound [0x80...] assertThat(shouldRead) .as("Should not read: UUIDs outside bounds in both RFC and signed order") .isFalse(); @@ -1160,11 +1152,9 @@ public void testLegacyUuidManifestInAllBelowLowerBound() { @Test public void testLegacyUuidManifestEq() { - // Query for 0x20... which is between 0x80... and 0x40... in signed order + // 0x20... is between 0x80... and 0x40... in signed order (within the manifest's actual range). + // RFC unsigned fails (0x20... < 0x80...), but legacy signed succeeds. UUID queryUuid = UUID.fromString("20000000-0000-0000-0000-000000000001"); - - // RFC evaluation would fail: 0x20... < 0x80... in unsigned, so outside [0x80..., 0x40...] - // Legacy evaluation should succeed: 0x80... < 0x20... < 0x40... in signed boolean shouldRead = ManifestEvaluator.forRowFilter(equal("uuid", queryUuid), SPEC, true) .eval(LEGACY_UUID_MANIFEST); @@ -1175,9 +1165,7 @@ public void testLegacyUuidManifestEq() { @Test public void testLegacyUuidManifestLt() { - // Query: uuid < 0x30... - // In signed order, values less than 0x30... include 0x80..., 0x00..., 0x20... - // The manifest's min is 0x80... which is < 0x30... in signed order + // In signed order, 0x80... < 0x30..., so the manifest's min satisfies uuid < 0x30... UUID queryUuid = UUID.fromString("30000000-0000-0000-0000-000000000001"); boolean shouldRead = ManifestEvaluator.forRowFilter(lessThan("uuid", queryUuid), SPEC, true) @@ -1189,9 +1177,7 @@ public void testLegacyUuidManifestLt() { @Test public void testLegacyUuidManifestGt() { - // Query: uuid > 0x20... - // In signed order, values greater than 0x20... include 0x30..., 0x40... - // The manifest's max is 0x40... which is > 0x20... in signed order + // In signed order, 0x40... > 0x20..., so the manifest's max satisfies uuid > 0x20... UUID queryUuid = UUID.fromString("20000000-0000-0000-0000-000000000001"); boolean shouldRead = ManifestEvaluator.forRowFilter(greaterThan("uuid", queryUuid), SPEC, true) diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java index 1ec7c54898bd..26ae8ba08ec6 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java @@ -103,6 +103,28 @@ public class TestInclusiveMetricsEvaluator { private static final UUID UUID_MAX_VALUE = UUID.fromString("ffffffff-ffff-ffff-0000-000000000000"); + // Legacy file with "inverted" bounds (min > max in unsigned order) + // Used for testing backward compatibility with files written using signed UUID comparator + // File contains UUIDs from 0x80... to 0x40... (in signed order) + // This represents actual values: 0x80..., 0x00..., 0x20..., 0x40... + private static final UUID LEGACY_SIGNED_MIN = + UUID.fromString("80000000-0000-0000-0000-000000000001"); // a small value in signed order + private static final UUID LEGACY_SIGNED_MAX = + UUID.fromString("40000000-0000-0000-0000-000000000001"); // a large value in signed order + + private static final DataFile LEGACY_UUID_FILE = + new TestDataFile( + "legacy_uuid_file.avro", + Row.of(), + 50, + ImmutableMap.of(15, 50L), + ImmutableMap.of(15, 0L), + null, + // lower bound: 0x80... (smallest in signed comparison) + ImmutableMap.of(15, toByteBuffer(Types.UUIDType.get(), LEGACY_SIGNED_MIN)), + // upper bound: 0x40... (largest in signed comparison) + ImmutableMap.of(15, toByteBuffer(Types.UUIDType.get(), LEGACY_SIGNED_MAX))); + private static final DataFile FILE = new TestDataFile( "file.avro", @@ -1363,56 +1385,28 @@ public void testUuidNotIn() { assertThat(shouldRead).as("Should read: notIn always reads").isTrue(); } - // Tests for legacy UUID file compatibility (files written with signed UUID comparator) + // Tests for legacy UUID file compatibility (files written with signed UUID comparator). // These tests simulate files where min/max bounds were computed using Java's signed comparison. // - // Key insight: Java's UUID.compareTo() compares MSB first, then LSB, both as SIGNED longs. - // This means: - // - UUIDs starting with 0x00-0x7F are "positive" (MSB is positive as signed long) - // - UUIDs starting with 0x80-0xFF are "negative" (MSB is negative as signed long) + // Java's UUID.compareTo() compares MSB first, then LSB, both as SIGNED longs: + // - UUIDs starting with 0x00-0x7F have positive MSB (as signed long) + // - UUIDs starting with 0x80-0xFF have negative MSB (as signed long) // - // Example file containing UUIDs: 0x00..., 0x40..., 0x80... + // For a file containing UUIDs 0x00..., 0x40..., 0x80...: // - Unsigned (RFC) order: 0x00... < 0x40... < 0x80... // - Signed (Java) order: 0x80... < 0x00... < 0x40... // - // If written with signed comparator, the file would have: - // - min = 0x80... (smallest in signed order) - // - max = 0x40... (largest in signed order) + // With signed comparator, the file has min=0x80..., max=0x40... (inverted bounds). // - // Query for 0x20... (which is in the file): - // - RFC evaluation: Is 0x20... in [0x80..., 0x40...]? - // In unsigned: 0x20... < 0x80... (lower bound), so NO - incorrectly skipped! - // - Legacy evaluation: Is 0x20... in [0x80..., 0x40...]? - // In signed: 0x80... < 0x20... < 0x40..., so YES - correctly included! - - // File contains UUIDs from 0x80... to 0x40... (in signed order) - // This represents actual values: 0x80..., 0x00..., 0x20..., 0x40... - private static final UUID LEGACY_SIGNED_MIN = - UUID.fromString("80000000-0000-0000-0000-000000000001"); // Smallest in signed order - private static final UUID LEGACY_SIGNED_MAX = - UUID.fromString("40000000-0000-0000-0000-000000000001"); // Largest in signed order - - private static final DataFile LEGACY_UUID_FILE = - new TestDataFile( - "legacy_uuid_file.avro", - Row.of(), - 50, - ImmutableMap.of(15, 50L), - ImmutableMap.of(15, 0L), - null, - // lower bound: 0x80... (smallest in signed comparison) - ImmutableMap.of(15, toByteBuffer(Types.UUIDType.get(), LEGACY_SIGNED_MIN)), - // upper bound: 0x40... (largest in signed comparison) - ImmutableMap.of(15, toByteBuffer(Types.UUIDType.get(), LEGACY_SIGNED_MAX))); + // Querying for 0x20... (which exists in the file): + // - RFC unsigned: 0x20... < 0x80... (lower bound), so incorrectly skipped + // - Legacy signed: 0x80... < 0x20... < 0x40..., so correctly included @Test public void testLegacyUuidFileEq() { - // Query for 0x20... which is between 0x80... and 0x40... in signed order - // (i.e., it's in the file's actual range) + // 0x20... is between 0x80... and 0x40... in signed order (within the file's actual range). + // RFC unsigned fails (0x20... < 0x80...), but legacy signed succeeds. UUID queryUuid = UUID.fromString("20000000-0000-0000-0000-000000000001"); - - // RFC evaluation would fail: 0x20... < 0x80... in unsigned, so outside [0x80..., 0x40...] - // Legacy evaluation should succeed: 0x80... < 0x20... < 0x40... in signed boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, equal("uuid", queryUuid)).eval(LEGACY_UUID_FILE); assertThat(shouldRead) @@ -1422,9 +1416,7 @@ public void testLegacyUuidFileEq() { @Test public void testLegacyUuidFileLt() { - // Query: uuid < 0x30... - // In signed order, values less than 0x30... include 0x80..., 0x00..., 0x20... - // The file contains 0x80... which is < 0x30... in signed order + // In signed order, 0x80... < 0x30..., so the file's min satisfies uuid < 0x30... UUID queryUuid = UUID.fromString("30000000-0000-0000-0000-000000000001"); boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThan("uuid", queryUuid)).eval(LEGACY_UUID_FILE); @@ -1435,9 +1427,7 @@ public void testLegacyUuidFileLt() { @Test public void testLegacyUuidFileGt() { - // Query: uuid > 0x20... - // In signed order, values greater than 0x20... include 0x30..., 0x40... - // The file's max is 0x40... which is > 0x20... in signed order + // In signed order, 0x40... > 0x20..., so the file's max satisfies uuid > 0x20... UUID queryUuid = UUID.fromString("20000000-0000-0000-0000-000000000001"); boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, greaterThan("uuid", queryUuid)) @@ -1449,10 +1439,8 @@ public void testLegacyUuidFileGt() { @Test public void testLegacyUuidFileIn() { - // Query: uuid IN (0x20..., 0x30...) - // Both are between 0x80... and 0x40... in signed order - // RFC evaluation: Both values are < 0x80... (lower bound) in unsigned, so filtered out - // Legacy evaluation: Uses signed comparator, so 0x80... < 0x20... < 0x30... < 0x40... + // 0x20... and 0x30... are between 0x80... and 0x40... in signed order. + // RFC unsigned filters them out (both < 0x80...), but legacy signed includes them. UUID uuid1 = UUID.fromString("20000000-0000-0000-0000-000000000001"); UUID uuid2 = UUID.fromString("30000000-0000-0000-0000-000000000001"); boolean shouldRead = @@ -1464,8 +1452,7 @@ public void testLegacyUuidFileIn() { @Test public void testNonUuidPredicateNoLegacyFallback() { - // Verify that non-UUID predicates don't trigger legacy fallback (no performance impact) - // This is a sanity check - the evaluator should work normally for non-UUID columns + // Non-UUID predicates should not trigger legacy fallback (no performance impact). boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, equal("id", 50)).eval(FILE); assertThat(shouldRead).as("Should read: id 50 is in range [30, 79]").isTrue(); diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java index 5508fb173aaa..0c243b2f17a5 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java @@ -735,24 +735,59 @@ SCHEMA, lessThanOrEqual("struct.nested_col_with_stats", INT_MAX_VALUE)) } // Tests for UUID with StrictMetricsEvaluator using RFC-compliant comparison only + // UUID_FILE has bounds [UUID_MIN=0x00...01, UUID_MAX=0x80...01] @Test public void testStrictUuidGt() { - // Query: uuid > 0x00... (all UUIDs in file should be > this) + // Query: uuid > belowMin (all UUIDs in file should be > this) UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); boolean allMatch = new StrictMetricsEvaluator(SCHEMA, greaterThan("uuid", belowMin)).eval(UUID_FILE); - // With bounds [UUID_MIN, UUID_MAX], all values should be > belowMin assertThat(allMatch).as("All UUIDs should be greater than belowMin").isTrue(); + + // Query: uuid > UUID_MIN (lower bound) - not all values are > lower bound + allMatch = new StrictMetricsEvaluator(SCHEMA, greaterThan("uuid", UUID_MIN)).eval(UUID_FILE); + assertThat(allMatch).as("Not all UUIDs are greater than lower bound").isFalse(); + + // Query: uuid > middle value - not all values are > middle + UUID middle = UUID.fromString("40000000-0000-0000-0000-000000000001"); + allMatch = new StrictMetricsEvaluator(SCHEMA, greaterThan("uuid", middle)).eval(UUID_FILE); + assertThat(allMatch).as("Not all UUIDs are greater than middle").isFalse(); + + // Query: uuid > UUID_MAX (upper bound) - no values are > upper bound + allMatch = new StrictMetricsEvaluator(SCHEMA, greaterThan("uuid", UUID_MAX)).eval(UUID_FILE); + assertThat(allMatch).as("No UUIDs are greater than upper bound").isFalse(); + + // Query: uuid > aboveMax - no values are > aboveMax + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + allMatch = new StrictMetricsEvaluator(SCHEMA, greaterThan("uuid", aboveMax)).eval(UUID_FILE); + assertThat(allMatch).as("No UUIDs are greater than aboveMax").isFalse(); } @Test public void testStrictUuidLt() { - // Query: uuid < 0xFF... (all UUIDs in file should be < this) - UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + // Query: uuid < belowMin - no values are < belowMin + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); boolean allMatch = - new StrictMetricsEvaluator(SCHEMA, lessThan("uuid", aboveMax)).eval(UUID_FILE); - // With bounds [UUID_MIN, UUID_MAX], all values should be < aboveMax + new StrictMetricsEvaluator(SCHEMA, lessThan("uuid", belowMin)).eval(UUID_FILE); + assertThat(allMatch).as("No UUIDs are less than belowMin").isFalse(); + + // Query: uuid < UUID_MIN (lower bound) - no values are < lower bound + allMatch = new StrictMetricsEvaluator(SCHEMA, lessThan("uuid", UUID_MIN)).eval(UUID_FILE); + assertThat(allMatch).as("No UUIDs are less than lower bound").isFalse(); + + // Query: uuid < middle value - not all values are < middle + UUID middle = UUID.fromString("40000000-0000-0000-0000-000000000001"); + allMatch = new StrictMetricsEvaluator(SCHEMA, lessThan("uuid", middle)).eval(UUID_FILE); + assertThat(allMatch).as("Not all UUIDs are less than middle").isFalse(); + + // Query: uuid < UUID_MAX (upper bound) - not all values are < upper bound + allMatch = new StrictMetricsEvaluator(SCHEMA, lessThan("uuid", UUID_MAX)).eval(UUID_FILE); + assertThat(allMatch).as("Not all UUIDs are less than upper bound").isFalse(); + + // Query: uuid < aboveMax (all UUIDs in file should be < this) + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + allMatch = new StrictMetricsEvaluator(SCHEMA, lessThan("uuid", aboveMax)).eval(UUID_FILE); assertThat(allMatch).as("All UUIDs should be less than aboveMax").isTrue(); } diff --git a/api/src/test/java/org/apache/iceberg/types/TestComparators.java b/api/src/test/java/org/apache/iceberg/types/TestComparators.java index 8a27bd84109d..7127050d60fc 100644 --- a/api/src/test/java/org/apache/iceberg/types/TestComparators.java +++ b/api/src/test/java/org/apache/iceberg/types/TestComparators.java @@ -108,7 +108,7 @@ public void testUuid() { Comparators.forType(Types.UUIDType.get()), UUID.fromString("60000000-0000-0000-0000-000000000000"), UUID.fromString("70000000-0000-0000-0000-000000000000")); - // The following assertion fails prior to the introduction of UUIDComparator. + // The following assertions fail prior to the introduction of unsigned UUID comparator. assertComparesCorrectly( Comparators.forType(Types.UUIDType.get()), UUID.fromString("70000000-0000-0000-0000-000000000000"), @@ -131,6 +131,47 @@ public void testUuid() { UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff")); } + @Test + public void testSignedUuid() { + // Test the signed UUID comparator used for backward compatibility with legacy files. + // In signed comparison, UUIDs with high bit set (0x80...) are treated as negative, + // so they compare less than UUIDs without high bit set (0x00-0x7F...). + Comparator signedCmp = Comparators.signedUUIDs(); + + // Within the "positive" range (0x00-0x7F), ordering is the same as unsigned + assertComparesCorrectly( + signedCmp, + UUID.fromString("00000000-0000-0000-0000-000000000000"), + UUID.fromString("40000000-0000-0000-0000-000000000000")); + assertComparesCorrectly( + signedCmp, + UUID.fromString("40000000-0000-0000-0000-000000000000"), + UUID.fromString("70000000-0000-0000-0000-000000000000")); + + // Within the "negative" range (0x80-0xFF), ordering is the same as unsigned + assertComparesCorrectly( + signedCmp, + UUID.fromString("80000000-0000-0000-0000-000000000000"), + UUID.fromString("90000000-0000-0000-0000-000000000000")); + assertComparesCorrectly( + signedCmp, + UUID.fromString("c0000000-0000-0000-0000-000000000000"), + UUID.fromString("f0000000-0000-0000-0000-000000000000")); + + // In signed comparison, 0x80... < 0x70... because the high bit is treated as negative. + // In unsigned comparison, 0x70... < 0x80... because bytes are compared as unsigned values. + UUID highBit = UUID.fromString("80000000-0000-0000-0000-000000000000"); + UUID lowBit = UUID.fromString("70000000-0000-0000-0000-000000000000"); + assertThat(signedCmp.compare(highBit, lowBit)) + .as("In signed comparison, 0x80... should be less than 0x70...") + .isLessThan(0); + + Comparator unsignedCmp = Comparators.forType(Types.UUIDType.get()); + assertThat(unsignedCmp.compare(highBit, lowBit)) + .as("In unsigned comparison, 0x80... should be greater than 0x70...") + .isGreaterThan(0); + } + @Test public void testFixed() { assertComparesCorrectly( diff --git a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java index 9437911c8902..4c0b6b4d3844 100644 --- a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java +++ b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java @@ -41,10 +41,13 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.assertj.core.api.Assumptions.assumeThat; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; import java.io.File; import java.io.IOException; import java.io.UncheckedIOException; +import java.nio.ByteBuffer; import java.util.Arrays; import java.util.List; import java.util.UUID; @@ -69,21 +72,27 @@ import org.apache.iceberg.orc.ORC; import org.apache.iceberg.parquet.Parquet; import org.apache.iceberg.parquet.ParquetMetricsRowGroupFilter; +import org.apache.iceberg.parquet.ParquetSchemaUtil; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.types.Types; import org.apache.iceberg.types.Types.DoubleType; import org.apache.iceberg.types.Types.FloatType; import org.apache.iceberg.types.Types.IntegerType; import org.apache.iceberg.types.Types.StringType; +import org.apache.iceberg.util.UUIDUtil; import org.apache.iceberg.variants.ShreddedObject; import org.apache.iceberg.variants.Variant; import org.apache.iceberg.variants.VariantMetadata; import org.apache.iceberg.variants.Variants; import org.apache.orc.OrcFile; import org.apache.orc.Reader; +import org.apache.parquet.column.statistics.Statistics; import org.apache.parquet.hadoop.ParquetFileReader; import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +import org.apache.parquet.hadoop.metadata.ColumnPath; import org.apache.parquet.io.DelegatingSeekableInputStream; +import org.apache.parquet.io.api.Binary; import org.apache.parquet.schema.MessageType; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.TestTemplate; @@ -1151,8 +1160,8 @@ public void testUUID() { /** * Tests UUID filtering with values that span the signed/unsigned comparison boundary. In RFC 9562 - * unsigned comparison: UUID_LOW < UUID_MID < UUID_HIGH < UUID_HIGHER In legacy signed comparison: - * UUID_HIGH < UUID_HIGHER < UUID_LOW < UUID_MID (high bit treated as negative) + * unsigned comparison: UUID_LOW < UUID_MID < UUID_HIGH < UUID_HIGHER. In legacy signed + * comparison: UUID_HIGH < UUID_HIGHER < UUID_LOW < UUID_MID (high bit treated as negative). * *

The dual-comparator approach ensures we don't incorrectly skip files that might contain * matching rows when reading legacy files with inverted UUID bounds. @@ -1276,6 +1285,119 @@ public void testUUIDComparisonBoundary() throws IOException { } } + /** + * Tests reading a legacy Parquet file's metadata where UUID min/max statistics were computed + * using signed comparison. + * + *

In signed comparison, UUIDs with high bit set (0x80...) are treated as negative, so they + * sort before UUIDs without high bit set. This creates "inverted bounds" where min > max in RFC + * unsigned order. + * + *

This test uses mocked Parquet metadata to simulate such a legacy file, verifying that the + * dual-comparator approach correctly handles queries that would otherwise be incorrectly filtered + * out using the unsigned comparison. + */ + @TestTemplate + public void testLegacyUUIDParquetFileWithSignedComparator() throws IOException { + assumeThat(format).as("Only valid for Parquet").isEqualTo(FileFormat.PARQUET); + + MessageType legacySchema = ParquetSchemaUtil.convert(UUID_SCHEMA, "legacy_uuid_test"); + + // Simulate inverted bounds from a legacy file written with signed comparator. + // In signed order: 0x80... (negative) < 0x00... < 0x20... < 0x40... (positive) + // So min=0x80..., max=0x40... represents a valid range in signed order, + // but appears inverted in RFC unsigned order where 0x40... < 0x80... + // Therefore, with unsigned UUID expression evaluation, the Parquet row group is not read. + UUID legacyMin = UUID.fromString("80000000-0000-0000-0000-000000000001"); + UUID legacyMax = UUID.fromString("40000000-0000-0000-0000-000000000001"); + + BlockMetaData mockRowGroup = createMockRowGroupWithUUIDStats(legacyMin, legacyMax); + + // Test 1: equal(0x20...) + // In unsigned order: 0x20... < 0x80... (min), fails lower bound check + // In signed order: 0x80... < 0x20... < 0x40..., passes bounds check + UUID queryInRange = UUID.fromString("20000000-0000-0000-0000-000000000001"); + assertThat(shouldReadUUID(equal("uuid_col", queryInRange), legacySchema, mockRowGroup)) + .as("Should read: signed fallback finds UUID in signed-ordered bounds") + .isTrue(); + + // Test 2: lessThan(0x30...) + // In unsigned order: min (0x80...) > 0x30..., fails lower bound check + // In signed order: min (0x80...) < 0x30..., passes lower bound check + UUID queryLt = UUID.fromString("30000000-0000-0000-0000-000000000001"); + assertThat(shouldReadUUID(lessThan("uuid_col", queryLt), legacySchema, mockRowGroup)) + .as("Should read: signed fallback finds UUIDs less than query in signed order") + .isTrue(); + + // Test 3: greaterThan(0x20...) + // In unsigned order: max (0x40...) < 0x80... (min), bounds are inverted + // In signed order: max (0x40...) > 0x20..., passes upper bound check + assertThat(shouldReadUUID(greaterThan("uuid_col", queryInRange), legacySchema, mockRowGroup)) + .as("Should read: signed fallback finds UUIDs greater than query in signed order") + .isTrue(); + + // Test 4: in(0x20..., 0x30...) + // In unsigned order: both values < 0x80... (min), fails lower bound check + // In signed order: both values between 0x80... and 0x40..., passes bounds check + assertThat(shouldReadUUID(in("uuid_col", queryInRange, queryLt), legacySchema, mockRowGroup)) + .as("Should read: signed fallback finds UUIDs in signed-ordered bounds") + .isTrue(); + + // Test 5: equal(0x50...) + // In unsigned order: 0x50... < 0x80... (min), fails lower bound check + // In signed order: 0x50... > 0x40... (max), fails upper bound check + UUID outsideBounds = UUID.fromString("50000000-0000-0000-0000-000000000001"); + assertThat(shouldReadUUID(equal("uuid_col", outsideBounds), legacySchema, mockRowGroup)) + .as("Should not read: UUID outside bounds in both RFC and signed order") + .isFalse(); + } + + /** + * Creates a mock BlockMetaData with UUID column statistics set to the specified min/max values. + * This simulates a legacy Parquet file where statistics were computed using signed comparison. + */ + private BlockMetaData createMockRowGroupWithUUIDStats(UUID minUuid, UUID maxUuid) { + // Convert UUIDs to Binary (16-byte fixed-length) + ByteBuffer minBuffer = UUIDUtil.convertToByteBuffer(minUuid); + ByteBuffer maxBuffer = UUIDUtil.convertToByteBuffer(maxUuid); + Binary minBinary = Binary.fromConstantByteBuffer(minBuffer); + Binary maxBinary = Binary.fromConstantByteBuffer(maxBuffer); + + // Create mock Statistics + @SuppressWarnings("unchecked") + Statistics mockStats = mock(Statistics.class); + when(mockStats.isEmpty()).thenReturn(false); + when(mockStats.hasNonNullValue()).thenReturn(true); + when(mockStats.isNumNullsSet()).thenReturn(true); + when(mockStats.getNumNulls()).thenReturn(0L); + when(mockStats.genericGetMin()).thenReturn(minBinary); + when(mockStats.genericGetMax()).thenReturn(maxBinary); + when(mockStats.getMinBytes()).thenReturn(minBuffer.array()); + when(mockStats.getMaxBytes()).thenReturn(maxBuffer.array()); + + // Create mock ColumnChunkMetaData for the UUID column + ColumnChunkMetaData mockUuidColumn = mock(ColumnChunkMetaData.class); + when(mockUuidColumn.getPath()).thenReturn(ColumnPath.fromDotString("uuid_col")); + when(mockUuidColumn.getStatistics()).thenReturn(mockStats); + when(mockUuidColumn.getValueCount()).thenReturn(50L); + + // Create mock ColumnChunkMetaData for the id column (required for schema) + ColumnChunkMetaData mockIdColumn = mock(ColumnChunkMetaData.class); + when(mockIdColumn.getPath()).thenReturn(ColumnPath.fromDotString("id")); + @SuppressWarnings("unchecked") + Statistics mockIdStats = mock(Statistics.class); + when(mockIdStats.isEmpty()).thenReturn(true); + when(mockIdColumn.getStatistics()).thenReturn(mockIdStats); + when(mockIdColumn.getValueCount()).thenReturn(50L); + + // Create mock BlockMetaData + BlockMetaData mockRowGroup = mock(BlockMetaData.class); + when(mockRowGroup.getRowCount()).thenReturn(50L); + when(mockRowGroup.getColumns()).thenReturn(Arrays.asList(mockIdColumn, mockUuidColumn)); + + return mockRowGroup; + } + private boolean shouldReadUUID( Expression expression, MessageType messageType, BlockMetaData blockMetaData) { return new ParquetMetricsRowGroupFilter(UUID_SCHEMA, expression, true) diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java index b1ee2d46f135..a459ae710222 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java @@ -81,7 +81,7 @@ public ParquetMetricsRowGroupFilter(Schema schema, Expression unbound, boolean c * @return false if the file cannot contain rows that match the expression, true otherwise. */ public boolean shouldRead(MessageType fileSchema, BlockMetaData rowGroup) { - boolean result = new MetricsEvalVisitor().eval(fileSchema, rowGroup, false); + boolean result = new MetricsEvalVisitor(schema).eval(fileSchema, rowGroup, expr, false); // If the RFC-compliant evaluation says rows might match, or there's no signed UUID expression, // return the result. @@ -93,22 +93,31 @@ public boolean shouldRead(MessageType fileSchema, BlockMetaData rowGroup) { // whether signed or unsigned comparator was used when the UUID column stats were written. // The signedUuidExpr has literals with signed comparators for lt/gt/eq predicates. // For IN predicates, signed comparator is used via Comparators#comparatorFor. - return new MetricsEvalVisitor().eval(fileSchema, rowGroup, true); + return new MetricsEvalVisitor(schema).eval(fileSchema, rowGroup, signedUuidExpr, true); } private static final boolean ROWS_MIGHT_MATCH = true; private static final boolean ROWS_CANNOT_MATCH = false; - private class MetricsEvalVisitor extends BoundExpressionVisitor { + private static class MetricsEvalVisitor extends BoundExpressionVisitor { + private final Schema schema; private Map> stats = null; private Map valueCounts = null; private Map> conversions = null; // Flag to use signed UUID comparator for backward compatibility. - // This is needed for the IN predicate because the comparator information is lost - // when binding converts literals to a Set of raw values. + // This is specifically necessary for IN predicates because the comparator information + // is lost when binding converts literals to a Set of raw values. private boolean useSignedUuidComparator = false; - private boolean eval(MessageType fileSchema, BlockMetaData rowGroup, boolean signedUuidMode) { + private MetricsEvalVisitor(Schema schema) { + this.schema = schema; + } + + private boolean eval( + MessageType fileSchema, + BlockMetaData rowGroup, + Expression expression, + boolean signedUuidMode) { if (rowGroup.getRowCount() <= 0) { return ROWS_CANNOT_MATCH; } @@ -128,7 +137,7 @@ private boolean eval(MessageType fileSchema, BlockMetaData rowGroup, boolean sig } } - return ExpressionVisitors.visitEvaluator(signedUuidMode ? signedUuidExpr : expr, this); + return ExpressionVisitors.visitEvaluator(expression, this); } @Override