diff --git a/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java b/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java
index 9bb2b713439d..80dd4466f629 100644
--- a/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java
+++ b/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java
@@ -24,12 +24,14 @@
import java.time.temporal.ChronoUnit;
import java.util.List;
import java.util.Locale;
+import java.util.UUID;
import java.util.concurrent.TimeUnit;
import java.util.function.Function;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.StreamSupport;
+import javax.annotation.Nullable;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
@@ -744,4 +746,105 @@ private static PartitionSpec identitySpec(Schema schema, int... ids) {
return specBuilder.build();
}
+
+ /**
+ * Transform UUID literals in an unbound expression to use signed comparators, if the expression
+ * contains UUID bounds predicates. This maintains backward compatibility with files written
+ * before RFC 4122/9562 compliant comparison was implemented.
+ *
+ *
The transformed expression contains literals with signed comparators for lt/gt/eq
+ * predicates. For IN predicates, the comparator information is lost when binding converts
+ * literals to a Set of raw values, so the evaluator must handle this separately.
+ *
+ * @param expr an unbound expression
+ * @return the expression with signed UUID comparators, or null if no UUID predicates are present
+ */
+ @Nullable
+ public static Expression toSignedUUIDLiteral(Expression expr) {
+ SignedUUIDLiteralVisitor visitor = new SignedUUIDLiteralVisitor();
+ Expression transformed = ExpressionVisitors.visit(expr, visitor);
+ return visitor.foundUUIDBoundsPredicate() ? transformed : null;
+ }
+
+ /**
+ * Visitor that transforms an expression to use the signed UUID comparator in all UUID literals,
+ * while also tracking whether any UUID bounds predicates were found.
+ */
+ private static class SignedUUIDLiteralVisitor
+ extends ExpressionVisitors.ExpressionVisitor {
+
+ private boolean foundUUIDBoundsPredicate = false;
+
+ boolean foundUUIDBoundsPredicate() {
+ return foundUUIDBoundsPredicate;
+ }
+
+ @Override
+ public Expression alwaysTrue() {
+ return Expressions.alwaysTrue();
+ }
+
+ @Override
+ public Expression alwaysFalse() {
+ return Expressions.alwaysFalse();
+ }
+
+ @Override
+ public Expression not(Expression result) {
+ return Expressions.not(result);
+ }
+
+ @Override
+ public Expression and(Expression leftResult, Expression rightResult) {
+ return Expressions.and(leftResult, rightResult);
+ }
+
+ @Override
+ public Expression or(Expression leftResult, Expression rightResult) {
+ return Expressions.or(leftResult, rightResult);
+ }
+
+ @Override
+ public Expression predicate(BoundPredicate pred) {
+ // Bound predicates should not be transformed - this is for unbound expressions
+ throw new UnsupportedOperationException(
+ "Cannot transform bound predicate; use unbound expressions");
+ }
+
+ @Override
+ @SuppressWarnings("unchecked")
+ public Expression predicate(UnboundPredicate pred) {
+ switch (pred.op()) {
+ case LT:
+ case LT_EQ:
+ case GT:
+ case GT_EQ:
+ case EQ:
+ case NOT_EQ:
+ if (pred.literal().value() instanceof UUID) {
+ foundUUIDBoundsPredicate = true;
+ Literals.UUIDLiteral uuidLit = (Literals.UUIDLiteral) pred.literal();
+ return new UnboundPredicate<>(
+ pred.op(), pred.term(), (T) uuidLit.withSignedComparator());
+ }
+ return pred;
+
+ case IN:
+ case NOT_IN:
+ List> literals = pred.literals();
+ if (!literals.isEmpty() && literals.get(0).value() instanceof UUID) {
+ foundUUIDBoundsPredicate = true;
+ List transformedValues =
+ literals.stream()
+ .map(l -> (T) ((Literals.UUIDLiteral) l).withSignedComparator())
+ .collect(Collectors.toList());
+ return new UnboundPredicate<>(pred.op(), pred.term(), transformedValues);
+ }
+ return pred;
+
+ default:
+ return pred;
+ }
+ }
+ }
}
diff --git a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java
index 81cbbe785519..5a6c52df4e49 100644
--- a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java
+++ b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java
@@ -56,6 +56,10 @@ public class InclusiveMetricsEvaluator {
private static final int IN_PREDICATE_LIMIT = 200;
private final Expression expr;
+ // Expression using signed UUID comparator for backward compatibility with files written
+ // prior to the introduction of RFC-compliant UUID comparisons.
+ // Null if there are no UUID predicates comparing against bounds in the expression.
+ private final Expression signedUuidExpr;
public InclusiveMetricsEvaluator(Schema schema, Expression unbound) {
this(schema, unbound, true);
@@ -63,7 +67,14 @@ public InclusiveMetricsEvaluator(Schema schema, Expression unbound) {
public InclusiveMetricsEvaluator(Schema schema, Expression unbound, boolean caseSensitive) {
StructType struct = schema.asStruct();
- this.expr = Binder.bind(struct, rewriteNot(unbound), caseSensitive);
+ Expression rewritten = rewriteNot(unbound);
+ this.expr = Binder.bind(struct, rewritten, caseSensitive);
+
+ // Create the signed UUID expression if and only if there are UUID predicates
+ // that compare against bounds.
+ Expression transformed = ExpressionUtil.toSignedUUIDLiteral(rewritten);
+ this.signedUuidExpr =
+ transformed != null ? Binder.bind(struct, transformed, caseSensitive) : null;
}
/**
@@ -74,20 +85,34 @@ public InclusiveMetricsEvaluator(Schema schema, Expression unbound, boolean case
*/
public boolean eval(ContentFile> file) {
// TODO: detect the case where a column is missing from the file using file's max field id.
- return new MetricsEvalVisitor().eval(file);
+ boolean result = new MetricsEvalVisitor().eval(file, expr, false);
+
+ // If the RFC-compliant evaluation says rows might match, or there's no signed UUID expression,
+ // return the result.
+ if (result || signedUuidExpr == null) {
+ return result;
+ }
+
+ // Always try with signed UUID comparator as a fallback. There is no reliable way to detect
+ // whether signed or unsigned comparator was used when the UUID column stats were written.
+ return new MetricsEvalVisitor().eval(file, signedUuidExpr, true);
}
private static final boolean ROWS_MIGHT_MATCH = true;
private static final boolean ROWS_CANNOT_MATCH = false;
- private class MetricsEvalVisitor extends ExpressionVisitors.BoundVisitor {
+ private static class MetricsEvalVisitor extends ExpressionVisitors.BoundVisitor {
private Map valueCounts = null;
private Map nullCounts = null;
private Map nanCounts = null;
private Map lowerBounds = null;
private Map upperBounds = null;
+ // Flag to use signed UUID comparator for backward compatibility.
+ // This is needed for the IN predicate because the comparator information is lost
+ // when binding converts literals to a Set of raw values.
+ private boolean useSignedUuidComparator = false;
- private boolean eval(ContentFile> file) {
+ private boolean eval(ContentFile> file, Expression expression, boolean signedUuidMode) {
if (file.recordCount() == 0) {
return ROWS_CANNOT_MATCH;
}
@@ -104,8 +129,9 @@ private boolean eval(ContentFile> file) {
this.nanCounts = file.nanValueCounts();
this.lowerBounds = file.lowerBounds();
this.upperBounds = file.upperBounds();
+ this.useSignedUuidComparator = signedUuidMode;
- return ExpressionVisitors.visitEvaluator(expr, this);
+ return ExpressionVisitors.visitEvaluator(expression, this);
}
@Override
@@ -359,10 +385,12 @@ public Boolean in(Bound term, Set literalSet) {
return ROWS_MIGHT_MATCH;
}
+ // use an appropriate comparator, for UUIDs use the signed comparator if requested
+ Comparator cmp =
+ Comparators.comparatorFor(
+ term.ref().type(), ((BoundTerm) term).comparator(), useSignedUuidComparator);
literals =
- literals.stream()
- .filter(v -> ((BoundTerm) term).comparator().compare(lower, v) <= 0)
- .collect(Collectors.toList());
+ literals.stream().filter(v -> cmp.compare(lower, v) <= 0).collect(Collectors.toList());
// if all values are less than lower bound, rows cannot match
if (literals.isEmpty()) {
return ROWS_CANNOT_MATCH;
@@ -374,9 +402,7 @@ public Boolean in(Bound term, Set literalSet) {
}
literals =
- literals.stream()
- .filter(v -> ((BoundTerm) term).comparator().compare(upper, v) >= 0)
- .collect(Collectors.toList());
+ literals.stream().filter(v -> cmp.compare(upper, v) >= 0).collect(Collectors.toList());
// if remaining values are greater than upper bound, rows cannot match
if (literals.isEmpty()) {
return ROWS_CANNOT_MATCH;
diff --git a/api/src/main/java/org/apache/iceberg/expressions/Literals.java b/api/src/main/java/org/apache/iceberg/expressions/Literals.java
index 3a45eb804f35..a1d8a39772bd 100644
--- a/api/src/main/java/org/apache/iceberg/expressions/Literals.java
+++ b/api/src/main/java/org/apache/iceberg/expressions/Literals.java
@@ -608,9 +608,25 @@ public String toString() {
}
}
- static class UUIDLiteral extends ComparableLiteral {
+ static class UUIDLiteral extends BaseLiteral {
+ // unsigned byte-wise comparator per RFC 4122/9562 specification
+ private static final Comparator UNSIGNED_CMP =
+ Comparators.nullsFirst().thenComparing(Comparators.uuids());
+ // signed comparator for backward compatibility with files written before the introduction of
+ // RFC-compliant comparison
+ private static final Comparator SIGNED_CMP =
+ Comparators.nullsFirst().thenComparing(Comparators.signedUUIDs());
+
+ // Flag to indicate which comparator to use (serializable)
+ private final boolean useSignedComparator;
+
UUIDLiteral(UUID value) {
+ this(value, false);
+ }
+
+ UUIDLiteral(UUID value, boolean useSignedComparator) {
super(value);
+ this.useSignedComparator = useSignedComparator;
}
@Override
@@ -622,10 +638,23 @@ public Literal to(Type type) {
return null;
}
+ @Override
+ public Comparator comparator() {
+ return useSignedComparator ? SIGNED_CMP : UNSIGNED_CMP;
+ }
+
@Override
protected Type.TypeID typeId() {
return Type.TypeID.UUID;
}
+
+ /**
+ * Creates a new UUIDLiteral with the signed comparator for backward compatibility with files
+ * written before RFC-compliant UUID comparisons were introduced.
+ */
+ UUIDLiteral withSignedComparator() {
+ return new UUIDLiteral(value(), true);
+ }
}
static class FixedLiteral extends BaseLiteral {
diff --git a/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java
index 21762946ca33..1a213e7a666d 100644
--- a/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java
+++ b/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java
@@ -34,6 +34,7 @@
import org.apache.iceberg.types.Comparators;
import org.apache.iceberg.types.Conversions;
import org.apache.iceberg.types.Type;
+import org.apache.iceberg.types.Types;
import org.apache.iceberg.util.BinaryUtil;
/**
@@ -51,6 +52,9 @@ public class ManifestEvaluator {
private static final int IN_PREDICATE_LIMIT = 200;
private final Expression expr;
+ // Expression using signed UUID comparator for backward compatibility with files written before
+ // RFC-compliant UUID comparisons were introduced. Null if no UUID predicates.
+ private final Expression signedUuidExpr;
public static ManifestEvaluator forRowFilter(
Expression rowFilter, PartitionSpec spec, boolean caseSensitive) {
@@ -64,7 +68,15 @@ public static ManifestEvaluator forPartitionFilter(
}
private ManifestEvaluator(PartitionSpec spec, Expression partitionFilter, boolean caseSensitive) {
- this.expr = Binder.bind(spec.partitionType(), rewriteNot(partitionFilter), caseSensitive);
+ Types.StructType partitionType = spec.partitionType();
+ Expression rewritten = rewriteNot(partitionFilter);
+ this.expr = Binder.bind(partitionType, rewritten, caseSensitive);
+
+ // Create the signed UUID expression if and only if there are UUID predicates
+ // that compare against bounds.
+ Expression transformed = ExpressionUtil.toSignedUUIDLiteral(rewritten);
+ this.signedUuidExpr =
+ transformed != null ? Binder.bind(partitionType, transformed, caseSensitive) : null;
}
/**
@@ -74,22 +86,37 @@ private ManifestEvaluator(PartitionSpec spec, Expression partitionFilter, boolea
* @return false if the file cannot contain rows that match the expression, true otherwise.
*/
public boolean eval(ManifestFile manifest) {
- return new ManifestEvalVisitor().eval(manifest);
+ boolean result = new ManifestEvalVisitor().eval(manifest, expr, false);
+
+ // If the RFC-compliant evaluation says rows might match, or there's no signed UUID expression,
+ // return the result.
+ if (result || signedUuidExpr == null) {
+ return result;
+ }
+
+ // Always try with signed UUID comparator as a fallback. There is no reliable way to detect
+ // whether signed or unsigned comparator was used when the UUID column stats were written.
+ return new ManifestEvalVisitor().eval(manifest, signedUuidExpr, true);
}
private static final boolean ROWS_MIGHT_MATCH = true;
private static final boolean ROWS_CANNOT_MATCH = false;
- private class ManifestEvalVisitor extends BoundExpressionVisitor {
+ private static class ManifestEvalVisitor extends BoundExpressionVisitor {
private List stats = null;
+ // Flag to use signed UUID comparator for backward compatibility.
+ // This is specifically necessary for IN predicates because the comparator information
+ // is lost when binding converts literals to a Set of raw values.
+ private boolean useSignedUuidComparator = false;
- private boolean eval(ManifestFile manifest) {
+ private boolean eval(ManifestFile manifest, Expression expression, boolean signedUuidMode) {
this.stats = manifest.partitions();
+ this.useSignedUuidComparator = signedUuidMode;
if (stats == null) {
return ROWS_MIGHT_MATCH;
}
- return ExpressionVisitors.visitEvaluator(expr, this);
+ return ExpressionVisitors.visitEvaluator(expression, this);
}
@Override
@@ -295,20 +322,19 @@ public Boolean in(BoundReference ref, Set literalSet) {
return ROWS_MIGHT_MATCH;
}
+ // use an appropriate comparator, for UUIDs use the signed comparator if requested
+ Comparator cmp =
+ Comparators.comparatorFor(ref.type(), ref.comparator(), useSignedUuidComparator);
T lower = Conversions.fromByteBuffer(ref.type(), fieldStats.lowerBound());
literals =
- literals.stream()
- .filter(v -> ref.comparator().compare(lower, v) <= 0)
- .collect(Collectors.toList());
+ literals.stream().filter(v -> cmp.compare(lower, v) <= 0).collect(Collectors.toList());
if (literals.isEmpty()) { // if all values are less than lower bound, rows cannot match.
return ROWS_CANNOT_MATCH;
}
T upper = Conversions.fromByteBuffer(ref.type(), fieldStats.upperBound());
literals =
- literals.stream()
- .filter(v -> ref.comparator().compare(upper, v) >= 0)
- .collect(Collectors.toList());
+ literals.stream().filter(v -> cmp.compare(upper, v) >= 0).collect(Collectors.toList());
if (literals
.isEmpty()) { // if all remaining values are greater than upper bound, rows cannot match.
return ROWS_CANNOT_MATCH;
diff --git a/api/src/main/java/org/apache/iceberg/types/Comparators.java b/api/src/main/java/org/apache/iceberg/types/Comparators.java
index ab59c895686d..a1821eee7332 100644
--- a/api/src/main/java/org/apache/iceberg/types/Comparators.java
+++ b/api/src/main/java/org/apache/iceberg/types/Comparators.java
@@ -22,6 +22,7 @@
import java.util.Comparator;
import java.util.List;
import java.util.Map;
+import java.util.UUID;
import java.util.function.IntFunction;
import org.apache.iceberg.StructLike;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
@@ -46,7 +47,7 @@ private Comparators() {}
.put(Types.TimestampNanoType.withZone(), Comparator.naturalOrder())
.put(Types.TimestampNanoType.withoutZone(), Comparator.naturalOrder())
.put(Types.StringType.get(), Comparators.charSequences())
- .put(Types.UUIDType.get(), Comparator.naturalOrder())
+ .put(Types.UUIDType.get(), Comparators.uuids())
.put(Types.BinaryType.get(), Comparators.unsignedBytes())
.put(Types.UnknownType.get(), Comparator.nullsFirst(Comparator.naturalOrder()))
.buildOrThrow();
@@ -233,6 +234,42 @@ public static Comparator filePath() {
return FilePathComparator.INSTANCE;
}
+ public static Comparator uuids() {
+ return UUIDComparator.INSTANCE;
+ }
+
+ /**
+ * Returns a comparator that uses Java's native signed UUID comparison.
+ *
+ * This is provided for backward compatibility with older files that may have been written with
+ * min/max statistics computed using Java's signed {@link UUID#compareTo(UUID)}. New code should
+ * use {@link #uuids()} which provides RFC 4122/9562 compliant unsigned comparison.
+ *
+ * @return a comparator using signed UUID comparison
+ */
+ public static Comparator signedUUIDs() {
+ return Comparator.naturalOrder();
+ }
+
+ /**
+ * Returns the given comparator for a type or the signed UUID comparator, conditionally.
+ *
+ * @param type the Iceberg type
+ * @param defaultComparator the default comparator to use for non-UUID types or when signed UUID
+ * comparison is not requested
+ * @param useSignedUuid if true and the type is UUID, returns a signed UUID comparator
+ * @return the appropriate comparator
+ */
+ @SuppressWarnings("unchecked")
+ public static Comparator comparatorFor(
+ Type type, Comparator defaultComparator, boolean useSignedUuid) {
+ if (useSignedUuid && type.typeId() == Type.TypeID.UUID) {
+ return (Comparator) signedUUIDs();
+ }
+
+ return defaultComparator;
+ }
+
private static class NullsFirst implements Comparator {
private static final NullsFirst> INSTANCE = new NullsFirst<>();
@@ -448,4 +485,40 @@ public int compare(CharSequence s1, CharSequence s2) {
return 0;
}
}
+
+ /**
+ * Compares UUIDs using unsigned byte-wise comparison using big-endian byte-order compliant with
+ * RFC 4122 and RFC 9562. Java's UUID.compareTo() compares most significant bits first, then least
+ * significant bits using signed value comparisons, which is a known bug.
+ */
+ private static class UUIDComparator implements Comparator {
+ private static final UUIDComparator INSTANCE = new UUIDComparator();
+
+ private UUIDComparator() {}
+
+ @Override
+ public int compare(UUID uuid1, UUID uuid2) {
+ if (uuid1 == uuid2) {
+ return 0;
+ }
+
+ // Compare most significant bits first (bytes 0-7 in big-endian representation)
+ long msb1 = uuid1.getMostSignificantBits();
+ long msb2 = uuid2.getMostSignificantBits();
+
+ // Use unsigned comparison for the most significant bits
+ int msbCompare = Long.compareUnsigned(msb1, msb2);
+ if (msbCompare != 0) {
+ return msbCompare;
+ }
+
+ // If most significant bits are equal, compare least significant bits (bytes 8-15)
+ long lsb1 = uuid1.getLeastSignificantBits();
+ long lsb2 = uuid2.getLeastSignificantBits();
+
+ // Use unsigned comparison for the least significant bits
+ return Long.compareUnsigned(lsb1, lsb2);
+ }
+ }
}
diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java
index d9fe26eacc6b..6373e73a12ec 100644
--- a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java
+++ b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java
@@ -29,6 +29,7 @@
import java.util.HashMap;
import java.util.List;
import java.util.Map;
+import java.util.UUID;
import java.util.regex.Pattern;
import java.util.stream.IntStream;
import org.apache.iceberg.PartitionSpec;
@@ -44,6 +45,7 @@
import org.apache.iceberg.variants.VariantPrimitive;
import org.apache.iceberg.variants.VariantTestUtil;
import org.apache.iceberg.variants.VariantValue;
+import org.assertj.core.api.InstanceOfAssertFactories;
import org.junit.jupiter.api.Test;
public class TestExpressionUtil {
@@ -1350,4 +1352,168 @@ private void assertEquals(UnboundTransform, ?> expected, UnboundTransform, ?
.hasToString(expected.transform().toString());
assertEquals(expected.ref(), actual.ref());
}
+
+ // Tests for UUID bounds predicate detection and transformation
+
+ @Test
+ public void testToSignedUUIDLiteralNoTransformForNonUuid() {
+ Expression original = Expressions.equal("id", 42L);
+ Expression result = ExpressionUtil.toSignedUUIDLiteral(original);
+
+ assertThat(result).as("Should return null for non-UUID predicate").isNull();
+ }
+
+ @Test
+ public void testToSignedUUIDLiteralNoTransformForIsNull() {
+ Expression original = Expressions.isNull("uuid_col");
+ Expression result = ExpressionUtil.toSignedUUIDLiteral(original);
+
+ assertThat(result)
+ .as("Should return null for UUID isNull (doesn't compare against bounds)")
+ .isNull();
+ }
+
+ @Test
+ public void testToSignedUUIDLiteralTransformsEqPredicate() {
+ UUID testUuid = UUID.randomUUID();
+ Expression original = Expressions.equal("uuid_col", testUuid);
+
+ // Verify the original literal uses unsigned comparator (useSignedComparator = false)
+ UnboundPredicate> originalPredicate = (UnboundPredicate>) original;
+ assertThat(originalPredicate.literal())
+ .isInstanceOf(Literals.UUIDLiteral.class)
+ .asInstanceOf(InstanceOfAssertFactories.type(Literals.UUIDLiteral.class))
+ .extracting("useSignedComparator")
+ .isEqualTo(false);
+
+ Expression result = ExpressionUtil.toSignedUUIDLiteral(original);
+ assertThat(result).isNotNull().isInstanceOf(UnboundPredicate.class);
+ UnboundPredicate> predicate = (UnboundPredicate>) result;
+ assertThat(predicate.literal().value()).isEqualTo(testUuid);
+ // The literal should now use the signed comparator
+ assertThat(predicate.literal())
+ .isInstanceOf(Literals.UUIDLiteral.class)
+ .asInstanceOf(InstanceOfAssertFactories.type(Literals.UUIDLiteral.class))
+ .extracting("useSignedComparator")
+ .isEqualTo(true);
+ }
+
+ @Test
+ public void testToSignedUUIDLiteralTransformsLtPredicate() {
+ UUID testUuid = UUID.randomUUID();
+ Expression original = Expressions.lessThan("uuid_col", testUuid);
+
+ // Verify the original literal uses unsigned comparator (useSignedComparator = false)
+ UnboundPredicate> originalPredicate = (UnboundPredicate>) original;
+ assertThat(originalPredicate.literal())
+ .isInstanceOf(Literals.UUIDLiteral.class)
+ .asInstanceOf(InstanceOfAssertFactories.type(Literals.UUIDLiteral.class))
+ .extracting("useSignedComparator")
+ .isEqualTo(false);
+
+ Expression result = ExpressionUtil.toSignedUUIDLiteral(original);
+ assertThat(result).isNotNull().isInstanceOf(UnboundPredicate.class);
+ UnboundPredicate> predicate = (UnboundPredicate>) result;
+ assertThat(predicate.literal())
+ .isInstanceOf(Literals.UUIDLiteral.class)
+ .asInstanceOf(InstanceOfAssertFactories.type(Literals.UUIDLiteral.class))
+ .extracting("useSignedComparator")
+ .isEqualTo(true);
+ }
+
+ @Test
+ public void testToSignedUUIDLiteralTransformsGtPredicate() {
+ UUID testUuid = UUID.randomUUID();
+ Expression original = Expressions.greaterThan("uuid_col", testUuid);
+
+ // Verify the original literal uses unsigned comparator (useSignedComparator = false)
+ UnboundPredicate> originalPredicate = (UnboundPredicate>) original;
+ assertThat(originalPredicate.literal())
+ .isInstanceOf(Literals.UUIDLiteral.class)
+ .asInstanceOf(InstanceOfAssertFactories.type(Literals.UUIDLiteral.class))
+ .extracting("useSignedComparator")
+ .isEqualTo(false);
+
+ Expression result = ExpressionUtil.toSignedUUIDLiteral(original);
+ assertThat(result).isNotNull().isInstanceOf(UnboundPredicate.class);
+ UnboundPredicate> predicate = (UnboundPredicate>) result;
+ assertThat(predicate.literal())
+ .isInstanceOf(Literals.UUIDLiteral.class)
+ .asInstanceOf(InstanceOfAssertFactories.type(Literals.UUIDLiteral.class))
+ .extracting("useSignedComparator")
+ .isEqualTo(true);
+ }
+
+ @Test
+ public void testToSignedUUIDLiteralTransformsInPredicate() {
+ UUID uuid1 = UUID.randomUUID();
+ UUID uuid2 = UUID.randomUUID();
+ Expression original = Expressions.in("uuid_col", uuid1, uuid2);
+
+ // Verify the original literals use unsigned comparator (useSignedComparator = false)
+ UnboundPredicate> originalPredicate = (UnboundPredicate>) original;
+ originalPredicate
+ .literals()
+ .forEach(
+ lit -> {
+ assertThat(lit)
+ .isInstanceOf(Literals.UUIDLiteral.class)
+ .asInstanceOf(InstanceOfAssertFactories.type(Literals.UUIDLiteral.class))
+ .extracting("useSignedComparator")
+ .isEqualTo(false);
+ });
+
+ Expression result = ExpressionUtil.toSignedUUIDLiteral(original);
+ assertThat(result).isNotNull().isInstanceOf(UnboundPredicate.class);
+ UnboundPredicate> predicate = (UnboundPredicate>) result;
+ assertThat(predicate.literals()).hasSize(2);
+ // All literals should use the signed comparator
+ predicate
+ .literals()
+ .forEach(
+ lit -> {
+ assertThat(lit)
+ .isInstanceOf(Literals.UUIDLiteral.class)
+ .asInstanceOf(InstanceOfAssertFactories.type(Literals.UUIDLiteral.class))
+ .extracting("useSignedComparator")
+ .isEqualTo(true);
+ });
+ }
+
+ @Test
+ public void testToSignedUUIDLiteralTransformsCompoundExpression() {
+ UUID testUuid = UUID.randomUUID();
+ Expression original =
+ Expressions.and(
+ Expressions.equal("id", 42L), Expressions.greaterThan("uuid_col", testUuid));
+
+ // Verify the original UUID literal uses unsigned comparator (useSignedComparator = false)
+ And originalAnd = (And) original;
+ UnboundPredicate> originalUuidPredicate = (UnboundPredicate>) originalAnd.right();
+ assertThat(originalUuidPredicate.literal())
+ .isInstanceOf(Literals.UUIDLiteral.class)
+ .asInstanceOf(InstanceOfAssertFactories.type(Literals.UUIDLiteral.class))
+ .extracting("useSignedComparator")
+ .isEqualTo(false);
+
+ Expression result = ExpressionUtil.toSignedUUIDLiteral(original);
+ assertThat(result).isNotNull().isInstanceOf(And.class);
+ And and = (And) result;
+
+ // Verify the non-UUID predicate is unchanged
+ assertThat(and.left()).isInstanceOf(UnboundPredicate.class);
+ UnboundPredicate> leftPredicate = (UnboundPredicate>) and.left();
+ assertThat(leftPredicate.literal())
+ .isInstanceOf(Literals.LongLiteral.class)
+ .extracting("value")
+ .isEqualTo(42L);
+
+ // Verify the UUID predicate is transformed with signed comparator
+ assertThat(and.right()).isInstanceOf(UnboundPredicate.class);
+ UnboundPredicate> rightPredicate = (UnboundPredicate>) and.right();
+ assertThat(rightPredicate.literal())
+ .isInstanceOf(Literals.UUIDLiteral.class)
+ .extracting("useSignedComparator")
+ .isEqualTo(true);
+ }
}
diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java
index 78e6064eb427..f83b87d61aae 100644
--- a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java
+++ b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java
@@ -42,6 +42,7 @@
import static org.assertj.core.api.Assertions.assertThatThrownBy;
import java.nio.ByteBuffer;
+import java.util.UUID;
import org.apache.iceberg.ManifestFile;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
@@ -69,7 +70,8 @@ public class TestInclusiveManifestEvaluator {
optional(15, "no_nulls_same_value_a", Types.StringType.get()),
optional(16, "single_value_with_nan", Types.FloatType.get()),
optional(17, "single_value_nan_unknown", Types.FloatType.get()),
- optional(18, "single_value_no_nan", Types.FloatType.get()));
+ optional(18, "single_value_no_nan", Types.FloatType.get()),
+ optional(19, "uuid", Types.UUIDType.get()));
private static final PartitionSpec SPEC =
PartitionSpec.builderFor(SCHEMA)
@@ -90,6 +92,7 @@ public class TestInclusiveManifestEvaluator {
.identity("single_value_with_nan")
.identity("single_value_nan_unknown")
.identity("single_value_no_nan")
+ .identity("uuid")
.build();
private static final int INT_MIN_VALUE = 30;
@@ -101,6 +104,75 @@ public class TestInclusiveManifestEvaluator {
private static final ByteBuffer STRING_MIN = toByteBuffer(Types.StringType.get(), "a");
private static final ByteBuffer STRING_MAX = toByteBuffer(Types.StringType.get(), "z");
+ // UUID_MIN has all zeros in MSB, all ones in LSB: 00000000-0000-0000-ffff-ffffffffffff
+ // UUID_MAX has all ones in MSB, all zeros in LSB: ffffffff-ffff-ffff-0000-000000000000
+ // With unsigned byte-wise comparison (correct): UUID_MIN < UUID_MAX (0x00... < 0xFF...)
+ // With Java's natural order (incorrect): UUID_MIN > UUID_MAX (MSB 0 > MSB -1 as signed long)
+ private static final UUID UUID_MIN_VALUE =
+ UUID.fromString("00000000-0000-0000-ffff-ffffffffffff");
+ private static final UUID UUID_MAX_VALUE =
+ UUID.fromString("ffffffff-ffff-ffff-0000-000000000000");
+
+ private static final ByteBuffer UUID_MIN = toByteBuffer(Types.UUIDType.get(), UUID_MIN_VALUE);
+ private static final ByteBuffer UUID_MAX = toByteBuffer(Types.UUIDType.get(), UUID_MAX_VALUE);
+
+ // Legacy manifest with "inverted" bounds (min > max in unsigned order)
+ // Used for testing backward compatibility with files written using signed UUID comparator
+ private static final UUID LEGACY_UUID_MIN =
+ UUID.fromString("80000000-0000-0000-0000-000000000001");
+ private static final UUID LEGACY_UUID_MAX =
+ UUID.fromString("40000000-0000-0000-0000-000000000001");
+
+ private static final ManifestFile LEGACY_UUID_MANIFEST =
+ new TestHelpers.TestManifestFile(
+ "legacy-uuid-manifest.avro",
+ 1024,
+ 0,
+ System.currentTimeMillis(),
+ 5,
+ 10,
+ 0,
+ ImmutableList.of(
+ new TestHelpers.TestFieldSummary(false, INT_MIN, INT_MAX),
+ new TestHelpers.TestFieldSummary(true, null, null),
+ new TestHelpers.TestFieldSummary(true, STRING_MIN, STRING_MAX),
+ new TestHelpers.TestFieldSummary(false, STRING_MIN, STRING_MAX),
+ new TestHelpers.TestFieldSummary(
+ false,
+ toByteBuffer(Types.FloatType.get(), 0F),
+ toByteBuffer(Types.FloatType.get(), 20F)),
+ new TestHelpers.TestFieldSummary(true, null, null),
+ new TestHelpers.TestFieldSummary(true, false, null, null),
+ new TestHelpers.TestFieldSummary(false, true, null, null),
+ new TestHelpers.TestFieldSummary(true, true, null, null),
+ new TestHelpers.TestFieldSummary(
+ false,
+ false,
+ toByteBuffer(Types.FloatType.get(), 0F),
+ toByteBuffer(Types.FloatType.get(), 20F)),
+ new TestHelpers.TestFieldSummary(true, null, null),
+ new TestHelpers.TestFieldSummary(true, STRING_MIN, STRING_MIN),
+ new TestHelpers.TestFieldSummary(false, STRING_MIN, STRING_MIN),
+ new TestHelpers.TestFieldSummary(
+ false,
+ true,
+ toByteBuffer(Types.FloatType.get(), 5.0F),
+ toByteBuffer(Types.FloatType.get(), 5.0F)),
+ new TestHelpers.TestFieldSummary(
+ false,
+ toByteBuffer(Types.FloatType.get(), 5.0F),
+ toByteBuffer(Types.FloatType.get(), 5.0F)),
+ new TestHelpers.TestFieldSummary(
+ false,
+ false,
+ toByteBuffer(Types.FloatType.get(), 5.0F),
+ toByteBuffer(Types.FloatType.get(), 5.0F)),
+ new TestHelpers.TestFieldSummary(
+ false,
+ toByteBuffer(Types.UUIDType.get(), LEGACY_UUID_MIN),
+ toByteBuffer(Types.UUIDType.get(), LEGACY_UUID_MAX))),
+ null);
+
private static final ManifestFile NO_STATS =
new TestHelpers.TestManifestFile(
"manifest-list.avro", 1024, 0, System.currentTimeMillis(), null, null, null, null, null);
@@ -148,7 +220,8 @@ public class TestInclusiveManifestEvaluator {
false,
false,
toByteBuffer(Types.FloatType.get(), 5.0F),
- toByteBuffer(Types.FloatType.get(), 5.0F))),
+ toByteBuffer(Types.FloatType.get(), 5.0F)),
+ new TestHelpers.TestFieldSummary(false, UUID_MIN, UUID_MAX)),
null);
@Test
@@ -853,4 +926,264 @@ public void testNotInWithSingleValue() {
.as("Should not read: manifest contains single float value with no NaNs")
.isFalse();
}
+
+ @Test
+ public void testUuidEq() {
+ UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000");
+ boolean shouldRead =
+ ManifestEvaluator.forRowFilter(equal("uuid", belowMin), SPEC, true).eval(FILE);
+ assertThat(shouldRead).as("Should not read: uuid below lower bound").isFalse();
+
+ shouldRead =
+ ManifestEvaluator.forRowFilter(equal("uuid", UUID_MIN_VALUE), SPEC, true).eval(FILE);
+ assertThat(shouldRead).as("Should read: uuid equal to lower bound").isTrue();
+
+ UUID between = UUID.fromString("7fffffff-ffff-ffff-7fff-ffffffffffff");
+ shouldRead = ManifestEvaluator.forRowFilter(equal("uuid", between), SPEC, true).eval(FILE);
+ assertThat(shouldRead).as("Should read: uuid between lower and upper bounds").isTrue();
+
+ shouldRead =
+ ManifestEvaluator.forRowFilter(equal("uuid", UUID_MAX_VALUE), SPEC, true).eval(FILE);
+ assertThat(shouldRead).as("Should read: uuid equal to upper bound").isTrue();
+
+ UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff");
+ shouldRead = ManifestEvaluator.forRowFilter(equal("uuid", aboveMax), SPEC, true).eval(FILE);
+ assertThat(shouldRead).as("Should not read: uuid above upper bound").isFalse();
+ }
+
+ @Test
+ public void testUuidLt() {
+ // With RFC comparison, belowMin is below the lower bound so no rows can be < belowMin.
+ // With signed comparison, UUID_MIN_VALUE.compareTo(belowMin) = -1 (lower < lit),
+ // so rows might match. We try both comparators and return true if either matches.
+ UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000");
+ boolean shouldRead =
+ ManifestEvaluator.forRowFilter(lessThan("uuid", belowMin), SPEC, true).eval(FILE);
+ assertThat(shouldRead)
+ .as("Should read: signed UUID fallback finds matches with inverted bounds")
+ .isTrue();
+
+ // UUID_MIN_VALUE is the lower bound, so no rows can be < UUID_MIN_VALUE.
+ // Both RFC and signed comparators agree on this since we're comparing the value to itself.
+ shouldRead =
+ ManifestEvaluator.forRowFilter(lessThan("uuid", UUID_MIN_VALUE), SPEC, true).eval(FILE);
+ assertThat(shouldRead)
+ .as("Should not read: uuid range below lower bound (UUID_MIN is not < UUID_MIN)")
+ .isFalse();
+
+ UUID justAboveMin = UUID.fromString("00000000-0000-0001-0000-000000000000");
+ shouldRead =
+ ManifestEvaluator.forRowFilter(lessThan("uuid", justAboveMin), SPEC, true).eval(FILE);
+ assertThat(shouldRead).as("Should read: one possible uuid").isTrue();
+
+ shouldRead =
+ ManifestEvaluator.forRowFilter(lessThan("uuid", UUID_MAX_VALUE), SPEC, true).eval(FILE);
+ assertThat(shouldRead).as("Should read: uuid between lower and upper bounds").isTrue();
+
+ UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff");
+ shouldRead = ManifestEvaluator.forRowFilter(lessThan("uuid", aboveMax), SPEC, true).eval(FILE);
+ assertThat(shouldRead).as("Should read: all uuids in range").isTrue();
+ }
+
+ @Test
+ public void testUuidLtEq() {
+ // With RFC comparison, belowMin is below the lower bound so no rows can be <= belowMin.
+ // With signed comparison, the bounds are inverted, so rows might match.
+ UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000");
+ boolean shouldRead =
+ ManifestEvaluator.forRowFilter(lessThanOrEqual("uuid", belowMin), SPEC, true).eval(FILE);
+ assertThat(shouldRead)
+ .as("Should read: signed UUID fallback finds matches with inverted bounds")
+ .isTrue();
+
+ shouldRead =
+ ManifestEvaluator.forRowFilter(lessThanOrEqual("uuid", UUID_MIN_VALUE), SPEC, true)
+ .eval(FILE);
+ assertThat(shouldRead).as("Should read: one possible uuid").isTrue();
+
+ shouldRead =
+ ManifestEvaluator.forRowFilter(lessThanOrEqual("uuid", UUID_MAX_VALUE), SPEC, true)
+ .eval(FILE);
+ assertThat(shouldRead).as("Should read: all uuids in range").isTrue();
+
+ UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff");
+ shouldRead =
+ ManifestEvaluator.forRowFilter(lessThanOrEqual("uuid", aboveMax), SPEC, true).eval(FILE);
+ assertThat(shouldRead).as("Should read: all uuids in range").isTrue();
+ }
+
+ @Test
+ public void testUuidGt() {
+ UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000");
+ boolean shouldRead =
+ ManifestEvaluator.forRowFilter(greaterThan("uuid", belowMin), SPEC, true).eval(FILE);
+ assertThat(shouldRead).as("Should read: all uuids in range").isTrue();
+
+ shouldRead =
+ ManifestEvaluator.forRowFilter(greaterThan("uuid", UUID_MIN_VALUE), SPEC, true).eval(FILE);
+ assertThat(shouldRead).as("Should read: uuid between lower and upper bounds").isTrue();
+
+ UUID justBelowMax = UUID.fromString("ffffffff-ffff-fffe-ffff-ffffffffffff");
+ shouldRead =
+ ManifestEvaluator.forRowFilter(greaterThan("uuid", justBelowMax), SPEC, true).eval(FILE);
+ assertThat(shouldRead)
+ .as("Should read: uuid between justBelowMax and the upper bound")
+ .isTrue();
+
+ // UUID_MAX_VALUE is the upper bound, so no rows can be > UUID_MAX_VALUE.
+ // Both RFC and signed comparators agree on this since we're comparing the value to itself.
+ shouldRead =
+ ManifestEvaluator.forRowFilter(greaterThan("uuid", UUID_MAX_VALUE), SPEC, true).eval(FILE);
+ assertThat(shouldRead)
+ .as("Should not read: uuid range above upper bound (UUID_MAX is not > UUID_MAX)")
+ .isFalse();
+
+ // With RFC comparison, aboveMax is above the upper bound so no rows can be > aboveMax.
+ // With signed comparison, UUID_MAX_VALUE.compareTo(aboveMax) = 1 (upper > lit),
+ // so rows might match. We try both comparators and return true if either matches.
+ UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff");
+ shouldRead =
+ ManifestEvaluator.forRowFilter(greaterThan("uuid", aboveMax), SPEC, true).eval(FILE);
+ assertThat(shouldRead)
+ .as("Should read: signed UUID fallback finds matches with inverted bounds")
+ .isTrue();
+ }
+
+ @Test
+ public void testUuidGtEq() {
+ UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000");
+ boolean shouldRead =
+ ManifestEvaluator.forRowFilter(greaterThanOrEqual("uuid", belowMin), SPEC, true).eval(FILE);
+ assertThat(shouldRead).as("Should read: all uuids in range").isTrue();
+
+ shouldRead =
+ ManifestEvaluator.forRowFilter(greaterThanOrEqual("uuid", UUID_MIN_VALUE), SPEC, true)
+ .eval(FILE);
+ assertThat(shouldRead).as("Should read: all uuids in range").isTrue();
+
+ shouldRead =
+ ManifestEvaluator.forRowFilter(greaterThanOrEqual("uuid", UUID_MAX_VALUE), SPEC, true)
+ .eval(FILE);
+ assertThat(shouldRead).as("Should read: one possible uuid").isTrue();
+
+ // With RFC comparison, aboveMax is above the upper bound so no rows can be >= aboveMax.
+ // With signed comparison, UUID_MAX_VALUE.compareTo(aboveMax) = 1 (upper > lit),
+ // so rows might match. We try both comparators and return true if either matches.
+ UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff");
+ shouldRead =
+ ManifestEvaluator.forRowFilter(greaterThanOrEqual("uuid", aboveMax), SPEC, true).eval(FILE);
+ assertThat(shouldRead)
+ .as("Should read: signed UUID fallback finds matches with inverted bounds")
+ .isTrue();
+ }
+
+ @Test
+ public void testUuidIn() {
+ UUID belowMin1 = UUID.fromString("00000000-0000-0000-0000-000000000000");
+ UUID belowMin2 = UUID.fromString("00000000-0000-0000-0000-000000000001");
+ boolean shouldRead =
+ ManifestEvaluator.forRowFilter(in("uuid", belowMin1, belowMin2), SPEC, true).eval(FILE);
+ assertThat(shouldRead).as("Should not read: uuids below lower bound").isFalse();
+
+ shouldRead =
+ ManifestEvaluator.forRowFilter(in("uuid", belowMin1, UUID_MIN_VALUE), SPEC, true)
+ .eval(FILE);
+ assertThat(shouldRead).as("Should read: uuid equal to lower bound").isTrue();
+
+ UUID middle1 = UUID.fromString("7fffffff-ffff-ffff-0000-000000000000");
+ UUID middle2 = UUID.fromString("7fffffff-ffff-ffff-ffff-ffffffffffff");
+ shouldRead =
+ ManifestEvaluator.forRowFilter(in("uuid", middle1, middle2), SPEC, true).eval(FILE);
+ assertThat(shouldRead).as("Should read: uuids between lower and upper bounds").isTrue();
+
+ UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff");
+ shouldRead =
+ ManifestEvaluator.forRowFilter(in("uuid", UUID_MAX_VALUE, aboveMax), SPEC, true).eval(FILE);
+ assertThat(shouldRead).as("Should read: uuid equal to upper bound").isTrue();
+
+ UUID aboveMax2 = UUID.fromString("ffffffff-ffff-ffff-ffff-fffffffffffe");
+ shouldRead =
+ ManifestEvaluator.forRowFilter(in("uuid", aboveMax, aboveMax2), SPEC, true).eval(FILE);
+ assertThat(shouldRead).as("Should not read: uuids above upper bound").isFalse();
+ }
+
+ // Tests for legacy UUID file compatibility (files written with signed UUID comparator).
+ // These tests simulate manifests where min/max bounds were computed using Java's signed
+ // comparison.
+ //
+ // Java's UUID.compareTo() compares MSB first, then LSB, both as SIGNED longs:
+ // - UUIDs starting with 0x00-0x7F have positive MSB (as signed long)
+ // - UUIDs starting with 0x80-0xFF have negative MSB (as signed long)
+ //
+ // For a file containing UUIDs 0x00..., 0x40..., 0x80...:
+ // - Unsigned (RFC) order: 0x00... < 0x40... < 0x80...
+ // - Signed (Java) order: 0x80... < 0x00... < 0x40...
+ //
+ // With signed comparator, the manifest has min=0x80..., max=0x40... (inverted bounds).
+
+ @Test
+ public void testLegacyUuidManifestIn() {
+ // 0x20... and 0x30... are between 0x80... and 0x40... in signed order.
+ // RFC unsigned filters them out (both < 0x80...), but legacy signed includes them.
+ UUID uuid1 = UUID.fromString("20000000-0000-0000-0000-000000000001");
+ UUID uuid2 = UUID.fromString("30000000-0000-0000-0000-000000000001");
+ boolean shouldRead =
+ ManifestEvaluator.forRowFilter(in("uuid", uuid1, uuid2), SPEC, true)
+ .eval(LEGACY_UUID_MANIFEST);
+ assertThat(shouldRead)
+ .as("Should read: legacy fallback should find UUIDs in signed-ordered bounds")
+ .isTrue();
+ }
+
+ @Test
+ public void testLegacyUuidManifestInAllBelowLowerBound() {
+ // 0x50... and 0x60... are outside bounds in both orderings:
+ // - RFC unsigned: both < 0x80... (lower bound)
+ // - Signed: both > 0x40... (upper bound)
+ UUID uuid1 = UUID.fromString("50000000-0000-0000-0000-000000000001");
+ UUID uuid2 = UUID.fromString("60000000-0000-0000-0000-000000000001");
+ boolean shouldRead =
+ ManifestEvaluator.forRowFilter(in("uuid", uuid1, uuid2), SPEC, true)
+ .eval(LEGACY_UUID_MANIFEST);
+ assertThat(shouldRead)
+ .as("Should not read: UUIDs outside bounds in both RFC and signed order")
+ .isFalse();
+ }
+
+ @Test
+ public void testLegacyUuidManifestEq() {
+ // 0x20... is between 0x80... and 0x40... in signed order (within the manifest's actual range).
+ // RFC unsigned fails (0x20... < 0x80...), but legacy signed succeeds.
+ UUID queryUuid = UUID.fromString("20000000-0000-0000-0000-000000000001");
+ boolean shouldRead =
+ ManifestEvaluator.forRowFilter(equal("uuid", queryUuid), SPEC, true)
+ .eval(LEGACY_UUID_MANIFEST);
+ assertThat(shouldRead)
+ .as("Should read: legacy fallback should find UUID in signed-ordered bounds")
+ .isTrue();
+ }
+
+ @Test
+ public void testLegacyUuidManifestLt() {
+ // In signed order, 0x80... < 0x30..., so the manifest's min satisfies uuid < 0x30...
+ UUID queryUuid = UUID.fromString("30000000-0000-0000-0000-000000000001");
+ boolean shouldRead =
+ ManifestEvaluator.forRowFilter(lessThan("uuid", queryUuid), SPEC, true)
+ .eval(LEGACY_UUID_MANIFEST);
+ assertThat(shouldRead)
+ .as("Should read: legacy fallback should find UUIDs less than query in signed order")
+ .isTrue();
+ }
+
+ @Test
+ public void testLegacyUuidManifestGt() {
+ // In signed order, 0x40... > 0x20..., so the manifest's max satisfies uuid > 0x20...
+ UUID queryUuid = UUID.fromString("20000000-0000-0000-0000-000000000001");
+ boolean shouldRead =
+ ManifestEvaluator.forRowFilter(greaterThan("uuid", queryUuid), SPEC, true)
+ .eval(LEGACY_UUID_MANIFEST);
+ assertThat(shouldRead)
+ .as("Should read: legacy fallback should find UUIDs greater than query in signed order")
+ .isTrue();
+ }
}
diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java
index 5f0ca2659fbf..26ae8ba08ec6 100644
--- a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java
+++ b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java
@@ -42,6 +42,7 @@
import static org.assertj.core.api.Assertions.assertThatThrownBy;
import java.util.List;
+import java.util.UUID;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.Schema;
import org.apache.iceberg.TestHelpers.Row;
@@ -71,7 +72,8 @@ public class TestInclusiveMetricsEvaluator {
optional(11, "all_nans_v1_stats", Types.FloatType.get()),
optional(12, "nan_and_null_only", Types.DoubleType.get()),
optional(13, "no_nan_stats", Types.DoubleType.get()),
- optional(14, "some_empty", Types.StringType.get()));
+ optional(14, "some_empty", Types.StringType.get()),
+ optional(15, "uuid", Types.UUIDType.get()));
private static final Schema NESTED_SCHEMA =
new Schema(
@@ -91,6 +93,38 @@ public class TestInclusiveMetricsEvaluator {
private static final int INT_MIN_VALUE = 30;
private static final int INT_MAX_VALUE = 79;
+ // UUIDs that demonstrate the difference between Java's natural order and byte-order comparison
+ // UUID_MIN has all zeros in MSB, all ones in LSB: 00000000-0000-0000-ffff-ffffffffffff
+ // UUID_MAX has all ones in MSB, all zeros in LSB: ffffffff-ffff-ffff-0000-000000000000
+ // With byte-order comparison (correct): UUID_MIN < UUID_MAX (0x00... < 0xFF...)
+ // With Java's natural order (incorrect): UUID_MIN > UUID_MAX (MSB 0 > MSB -1 as signed long)
+ private static final UUID UUID_MIN_VALUE =
+ UUID.fromString("00000000-0000-0000-ffff-ffffffffffff");
+ private static final UUID UUID_MAX_VALUE =
+ UUID.fromString("ffffffff-ffff-ffff-0000-000000000000");
+
+ // Legacy file with "inverted" bounds (min > max in unsigned order)
+ // Used for testing backward compatibility with files written using signed UUID comparator
+ // File contains UUIDs from 0x80... to 0x40... (in signed order)
+ // This represents actual values: 0x80..., 0x00..., 0x20..., 0x40...
+ private static final UUID LEGACY_SIGNED_MIN =
+ UUID.fromString("80000000-0000-0000-0000-000000000001"); // a small value in signed order
+ private static final UUID LEGACY_SIGNED_MAX =
+ UUID.fromString("40000000-0000-0000-0000-000000000001"); // a large value in signed order
+
+ private static final DataFile LEGACY_UUID_FILE =
+ new TestDataFile(
+ "legacy_uuid_file.avro",
+ Row.of(),
+ 50,
+ ImmutableMap.of(15, 50L),
+ ImmutableMap.of(15, 0L),
+ null,
+ // lower bound: 0x80... (smallest in signed comparison)
+ ImmutableMap.of(15, toByteBuffer(Types.UUIDType.get(), LEGACY_SIGNED_MIN)),
+ // upper bound: 0x40... (largest in signed comparison)
+ ImmutableMap.of(15, toByteBuffer(Types.UUIDType.get(), LEGACY_SIGNED_MAX)));
+
private static final DataFile FILE =
new TestDataFile(
"file.avro",
@@ -109,6 +143,7 @@ public class TestInclusiveMetricsEvaluator {
.put(12, 50L)
.put(13, 50L)
.put(14, 50L)
+ .put(15, 50L)
.buildOrThrow(),
// null value counts
ImmutableMap.builder()
@@ -119,6 +154,7 @@ public class TestInclusiveMetricsEvaluator {
.put(11, 0L)
.put(12, 1L)
.put(14, 0L)
+ .put(15, 0L)
.buildOrThrow(),
// nan value counts
ImmutableMap.of(
@@ -130,13 +166,15 @@ public class TestInclusiveMetricsEvaluator {
1, toByteBuffer(IntegerType.get(), INT_MIN_VALUE),
11, toByteBuffer(Types.FloatType.get(), Float.NaN),
12, toByteBuffer(Types.DoubleType.get(), Double.NaN),
- 14, toByteBuffer(Types.StringType.get(), "")),
+ 14, toByteBuffer(Types.StringType.get(), ""),
+ 15, toByteBuffer(Types.UUIDType.get(), UUID_MIN_VALUE)),
// upper bounds
ImmutableMap.of(
1, toByteBuffer(IntegerType.get(), INT_MAX_VALUE),
11, toByteBuffer(Types.FloatType.get(), Float.NaN),
12, toByteBuffer(Types.DoubleType.get(), Double.NaN),
- 14, toByteBuffer(Types.StringType.get(), "房东整租霍营小区二层两居室")));
+ 14, toByteBuffer(Types.StringType.get(), "房东整租霍营小区二层两居室"),
+ 15, toByteBuffer(Types.UUIDType.get(), UUID_MAX_VALUE)));
private static final DataFile FILE_2 =
new TestDataFile(
@@ -1138,4 +1176,287 @@ public void testNotInWithSingleValue() {
.as("Should read: file has NaN values which match NOT IN predicate")
.isTrue();
}
+
+ @Test
+ public void testUuidEq() {
+ UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000");
+ boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, equal("uuid", belowMin)).eval(FILE);
+ assertThat(shouldRead).as("Should not read: uuid below lower bound").isFalse();
+
+ shouldRead = new InclusiveMetricsEvaluator(SCHEMA, equal("uuid", UUID_MIN_VALUE)).eval(FILE);
+ assertThat(shouldRead).as("Should read: uuid equal to lower bound").isTrue();
+
+ UUID between = UUID.fromString("7fffffff-ffff-ffff-7fff-ffffffffffff");
+ shouldRead = new InclusiveMetricsEvaluator(SCHEMA, equal("uuid", between)).eval(FILE);
+ assertThat(shouldRead).as("Should read: uuid between lower and upper bounds").isTrue();
+
+ shouldRead = new InclusiveMetricsEvaluator(SCHEMA, equal("uuid", UUID_MAX_VALUE)).eval(FILE);
+ assertThat(shouldRead).as("Should read: uuid equal to upper bound").isTrue();
+
+ UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff");
+ shouldRead = new InclusiveMetricsEvaluator(SCHEMA, equal("uuid", aboveMax)).eval(FILE);
+ assertThat(shouldRead).as("Should not read: uuid above upper bound").isFalse();
+ }
+
+ @Test
+ public void testUuidLt() {
+ // With RFC comparison, belowMin is below the lower bound so no rows can be < belowMin.
+ // With signed comparison, UUID_MIN_VALUE.compareTo(belowMin) = -1 (lower < lit),
+ // so rows might match. We try both comparators and return true if either matches.
+ UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000");
+ boolean shouldRead =
+ new InclusiveMetricsEvaluator(SCHEMA, lessThan("uuid", belowMin)).eval(FILE);
+ assertThat(shouldRead)
+ .as("Should read: signed UUID fallback finds matches with inverted bounds")
+ .isTrue();
+
+ // UUID_MIN_VALUE is the lower bound, so no rows can be < UUID_MIN_VALUE.
+ // Both RFC and signed comparators agree on this since we're comparing the value to itself.
+ shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThan("uuid", UUID_MIN_VALUE)).eval(FILE);
+ assertThat(shouldRead)
+ .as("Should not read: uuid range below lower bound (UUID_MIN is not < UUID_MIN)")
+ .isFalse();
+
+ UUID justAboveMin = UUID.fromString("00000000-0000-0001-0000-000000000000");
+ shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThan("uuid", justAboveMin)).eval(FILE);
+ assertThat(shouldRead).as("Should read: one possible uuid").isTrue();
+
+ shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThan("uuid", UUID_MAX_VALUE)).eval(FILE);
+ assertThat(shouldRead).as("Should read: uuid between lower and upper bounds").isTrue();
+
+ UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff");
+ shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThan("uuid", aboveMax)).eval(FILE);
+ assertThat(shouldRead).as("Should read: all uuids in range").isTrue();
+ }
+
+ @Test
+ public void testUuidLtEq() {
+ // With RFC comparison, belowMin is below the lower bound so no rows can be <= belowMin.
+ // However, we also try signed comparison for backward compatibility, and with signed comparison
+ // the bounds are inverted, so rows might match.
+ UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000");
+ boolean shouldRead =
+ new InclusiveMetricsEvaluator(SCHEMA, lessThanOrEqual("uuid", belowMin)).eval(FILE);
+ assertThat(shouldRead)
+ .as("Should read: signed UUID fallback may find matches with inverted bounds")
+ .isTrue();
+
+ shouldRead =
+ new InclusiveMetricsEvaluator(SCHEMA, lessThanOrEqual("uuid", UUID_MIN_VALUE)).eval(FILE);
+ assertThat(shouldRead).as("Should read: one possible uuid").isTrue();
+
+ shouldRead =
+ new InclusiveMetricsEvaluator(SCHEMA, lessThanOrEqual("uuid", UUID_MAX_VALUE)).eval(FILE);
+ assertThat(shouldRead).as("Should read: all uuids in range").isTrue();
+
+ UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff");
+ shouldRead =
+ new InclusiveMetricsEvaluator(SCHEMA, lessThanOrEqual("uuid", aboveMax)).eval(FILE);
+ assertThat(shouldRead).as("Should read: all uuids in range").isTrue();
+ }
+
+ @Test
+ public void testUuidGt() {
+ UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000");
+ boolean shouldRead =
+ new InclusiveMetricsEvaluator(SCHEMA, greaterThan("uuid", belowMin)).eval(FILE);
+ assertThat(shouldRead).as("Should read: all uuids in range").isTrue();
+
+ shouldRead =
+ new InclusiveMetricsEvaluator(SCHEMA, greaterThan("uuid", UUID_MIN_VALUE)).eval(FILE);
+ assertThat(shouldRead).as("Should read: uuid between lower and upper bounds").isTrue();
+
+ UUID justBelowMax = UUID.fromString("ffffffff-ffff-fffe-ffff-ffffffffffff");
+ shouldRead =
+ new InclusiveMetricsEvaluator(SCHEMA, greaterThan("uuid", justBelowMax)).eval(FILE);
+ assertThat(shouldRead).as("Should read: one possible uuid").isTrue();
+
+ // UUID_MAX_VALUE is the upper bound, so no rows can be > UUID_MAX_VALUE.
+ // Both RFC and signed comparators agree on this since we're comparing the value to itself.
+ shouldRead =
+ new InclusiveMetricsEvaluator(SCHEMA, greaterThan("uuid", UUID_MAX_VALUE)).eval(FILE);
+ assertThat(shouldRead)
+ .as("Should not read: uuid range above upper bound (UUID_MAX is not > UUID_MAX)")
+ .isFalse();
+
+ // With RFC comparison, aboveMax is above the upper bound so no rows can be > aboveMax.
+ // With signed comparison, UUID_MAX_VALUE.compareTo(aboveMax) = 1 (upper > lit),
+ // so rows might match. We try both comparators and return true if either matches.
+ UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff");
+ shouldRead = new InclusiveMetricsEvaluator(SCHEMA, greaterThan("uuid", aboveMax)).eval(FILE);
+ assertThat(shouldRead)
+ .as("Should read: signed UUID fallback finds matches with inverted bounds")
+ .isTrue();
+ }
+
+ @Test
+ public void testUuidGtEq() {
+ UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000");
+ boolean shouldRead =
+ new InclusiveMetricsEvaluator(SCHEMA, greaterThanOrEqual("uuid", belowMin)).eval(FILE);
+ assertThat(shouldRead).as("Should read: all uuids in range").isTrue();
+
+ shouldRead =
+ new InclusiveMetricsEvaluator(SCHEMA, greaterThanOrEqual("uuid", UUID_MIN_VALUE))
+ .eval(FILE);
+ assertThat(shouldRead).as("Should read: all uuids in range").isTrue();
+
+ shouldRead =
+ new InclusiveMetricsEvaluator(SCHEMA, greaterThanOrEqual("uuid", UUID_MAX_VALUE))
+ .eval(FILE);
+ assertThat(shouldRead).as("Should read: one possible uuid").isTrue();
+
+ // With RFC comparison, aboveMax is above the upper bound so no rows can be >= aboveMax.
+ // With signed comparison, UUID_MAX_VALUE.compareTo(aboveMax) = 1 (upper > lit),
+ // so rows might match. We try both comparators and return true if either matches.
+ UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff");
+ shouldRead =
+ new InclusiveMetricsEvaluator(SCHEMA, greaterThanOrEqual("uuid", aboveMax)).eval(FILE);
+ assertThat(shouldRead)
+ .as("Should read: signed UUID fallback finds matches with inverted bounds")
+ .isTrue();
+ }
+
+ @Test
+ public void testUuidIn() {
+ UUID belowMin1 = UUID.fromString("00000000-0000-0000-0000-000000000000");
+ UUID belowMin2 = UUID.fromString("00000000-0000-0000-0000-000000000001");
+ boolean shouldRead =
+ new InclusiveMetricsEvaluator(SCHEMA, in("uuid", belowMin1, belowMin2)).eval(FILE);
+ assertThat(shouldRead).as("Should not read: uuids below lower bound").isFalse();
+
+ shouldRead =
+ new InclusiveMetricsEvaluator(SCHEMA, in("uuid", belowMin1, UUID_MIN_VALUE)).eval(FILE);
+ assertThat(shouldRead).as("Should read: uuid equal to lower bound").isTrue();
+
+ UUID middle1 = UUID.fromString("7fffffff-ffff-ffff-0000-000000000000");
+ UUID middle2 = UUID.fromString("7fffffff-ffff-ffff-ffff-ffffffffffff");
+ shouldRead = new InclusiveMetricsEvaluator(SCHEMA, in("uuid", middle1, middle2)).eval(FILE);
+ assertThat(shouldRead).as("Should read: uuids between lower and upper bounds").isTrue();
+
+ UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff");
+ shouldRead =
+ new InclusiveMetricsEvaluator(SCHEMA, in("uuid", UUID_MAX_VALUE, aboveMax)).eval(FILE);
+ assertThat(shouldRead).as("Should read: uuid equal to upper bound").isTrue();
+
+ UUID aboveMax2 = UUID.fromString("ffffffff-ffff-ffff-ffff-fffffffffffe");
+ shouldRead = new InclusiveMetricsEvaluator(SCHEMA, in("uuid", aboveMax, aboveMax2)).eval(FILE);
+ assertThat(shouldRead).as("Should not read: uuids above upper bound").isFalse();
+ }
+
+ @Test
+ public void testUuidNotEq() {
+ UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000");
+ boolean shouldRead =
+ new InclusiveMetricsEvaluator(SCHEMA, notEqual("uuid", belowMin)).eval(FILE);
+ assertThat(shouldRead).as("Should read: notEqual always reads").isTrue();
+
+ shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notEqual("uuid", UUID_MIN_VALUE)).eval(FILE);
+ assertThat(shouldRead).as("Should read: notEqual always reads").isTrue();
+
+ UUID middle = UUID.fromString("7fffffff-ffff-ffff-7fff-ffffffffffff");
+ shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notEqual("uuid", middle)).eval(FILE);
+ assertThat(shouldRead).as("Should read: notEqual always reads").isTrue();
+
+ shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notEqual("uuid", UUID_MAX_VALUE)).eval(FILE);
+ assertThat(shouldRead).as("Should read: notEqual always reads").isTrue();
+
+ UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff");
+ shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notEqual("uuid", aboveMax)).eval(FILE);
+ assertThat(shouldRead).as("Should read: notEqual always reads").isTrue();
+ }
+
+ @Test
+ public void testUuidNotIn() {
+ UUID belowMin1 = UUID.fromString("00000000-0000-0000-0000-000000000000");
+ UUID belowMin2 = UUID.fromString("00000000-0000-0000-0000-000000000001");
+ boolean shouldRead =
+ new InclusiveMetricsEvaluator(SCHEMA, notIn("uuid", belowMin1, belowMin2)).eval(FILE);
+ assertThat(shouldRead).as("Should read: notIn always reads").isTrue();
+
+ shouldRead =
+ new InclusiveMetricsEvaluator(SCHEMA, notIn("uuid", UUID_MIN_VALUE, UUID_MAX_VALUE))
+ .eval(FILE);
+ assertThat(shouldRead).as("Should read: notIn always reads").isTrue();
+
+ UUID middle1 = UUID.fromString("7fffffff-ffff-ffff-0000-000000000000");
+ UUID middle2 = UUID.fromString("7fffffff-ffff-ffff-ffff-ffffffffffff");
+ shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notIn("uuid", middle1, middle2)).eval(FILE);
+ assertThat(shouldRead).as("Should read: notIn always reads").isTrue();
+ }
+
+ // Tests for legacy UUID file compatibility (files written with signed UUID comparator).
+ // These tests simulate files where min/max bounds were computed using Java's signed comparison.
+ //
+ // Java's UUID.compareTo() compares MSB first, then LSB, both as SIGNED longs:
+ // - UUIDs starting with 0x00-0x7F have positive MSB (as signed long)
+ // - UUIDs starting with 0x80-0xFF have negative MSB (as signed long)
+ //
+ // For a file containing UUIDs 0x00..., 0x40..., 0x80...:
+ // - Unsigned (RFC) order: 0x00... < 0x40... < 0x80...
+ // - Signed (Java) order: 0x80... < 0x00... < 0x40...
+ //
+ // With signed comparator, the file has min=0x80..., max=0x40... (inverted bounds).
+ //
+ // Querying for 0x20... (which exists in the file):
+ // - RFC unsigned: 0x20... < 0x80... (lower bound), so incorrectly skipped
+ // - Legacy signed: 0x80... < 0x20... < 0x40..., so correctly included
+
+ @Test
+ public void testLegacyUuidFileEq() {
+ // 0x20... is between 0x80... and 0x40... in signed order (within the file's actual range).
+ // RFC unsigned fails (0x20... < 0x80...), but legacy signed succeeds.
+ UUID queryUuid = UUID.fromString("20000000-0000-0000-0000-000000000001");
+ boolean shouldRead =
+ new InclusiveMetricsEvaluator(SCHEMA, equal("uuid", queryUuid)).eval(LEGACY_UUID_FILE);
+ assertThat(shouldRead)
+ .as("Should read: legacy fallback should find UUID in signed-ordered bounds")
+ .isTrue();
+ }
+
+ @Test
+ public void testLegacyUuidFileLt() {
+ // In signed order, 0x80... < 0x30..., so the file's min satisfies uuid < 0x30...
+ UUID queryUuid = UUID.fromString("30000000-0000-0000-0000-000000000001");
+ boolean shouldRead =
+ new InclusiveMetricsEvaluator(SCHEMA, lessThan("uuid", queryUuid)).eval(LEGACY_UUID_FILE);
+ assertThat(shouldRead)
+ .as("Should read: legacy fallback should find UUIDs less than query in signed order")
+ .isTrue();
+ }
+
+ @Test
+ public void testLegacyUuidFileGt() {
+ // In signed order, 0x40... > 0x20..., so the file's max satisfies uuid > 0x20...
+ UUID queryUuid = UUID.fromString("20000000-0000-0000-0000-000000000001");
+ boolean shouldRead =
+ new InclusiveMetricsEvaluator(SCHEMA, greaterThan("uuid", queryUuid))
+ .eval(LEGACY_UUID_FILE);
+ assertThat(shouldRead)
+ .as("Should read: legacy fallback should find UUIDs greater than query in signed order")
+ .isTrue();
+ }
+
+ @Test
+ public void testLegacyUuidFileIn() {
+ // 0x20... and 0x30... are between 0x80... and 0x40... in signed order.
+ // RFC unsigned filters them out (both < 0x80...), but legacy signed includes them.
+ UUID uuid1 = UUID.fromString("20000000-0000-0000-0000-000000000001");
+ UUID uuid2 = UUID.fromString("30000000-0000-0000-0000-000000000001");
+ boolean shouldRead =
+ new InclusiveMetricsEvaluator(SCHEMA, in("uuid", uuid1, uuid2)).eval(LEGACY_UUID_FILE);
+ assertThat(shouldRead)
+ .as("Should read: legacy fallback should find UUIDs in signed-ordered bounds")
+ .isTrue();
+ }
+
+ @Test
+ public void testNonUuidPredicateNoLegacyFallback() {
+ // Non-UUID predicates should not trigger legacy fallback (no performance impact).
+ boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, equal("id", 50)).eval(FILE);
+ assertThat(shouldRead).as("Should read: id 50 is in range [30, 79]").isTrue();
+
+ shouldRead = new InclusiveMetricsEvaluator(SCHEMA, equal("id", 100)).eval(FILE);
+ assertThat(shouldRead).as("Should not read: id 100 is above range [30, 79]").isFalse();
+ }
}
diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java
index f34cd730df77..0c243b2f17a5 100644
--- a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java
+++ b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java
@@ -39,6 +39,7 @@
import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatThrownBy;
+import java.util.UUID;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.Schema;
import org.apache.iceberg.TestHelpers.Row;
@@ -73,7 +74,8 @@ public class TestStrictMetricsEvaluator {
Types.StructType.of(
Types.NestedField.optional(16, "nested_col_no_stats", Types.IntegerType.get()),
Types.NestedField.optional(
- 17, "nested_col_with_stats", Types.IntegerType.get()))));
+ 17, "nested_col_with_stats", Types.IntegerType.get()))),
+ optional(18, "uuid", Types.UUIDType.get()));
private static final int INT_MIN_VALUE = 30;
private static final int INT_MAX_VALUE = 79;
@@ -172,6 +174,53 @@ public class TestStrictMetricsEvaluator {
// upper bounds
ImmutableMap.of(5, toByteBuffer(StringType.get(), "bbb")));
+ // UUIDs for testing dual-comparator behavior with high-bit values
+ // These UUIDs span the signed/unsigned comparison boundary
+ private static final UUID UUID_MIN = UUID.fromString("00000000-0000-0000-0000-000000000001");
+ private static final UUID UUID_MAX = UUID.fromString("80000000-0000-0000-0000-000000000001");
+
+ private static final DataFile UUID_FILE =
+ new TestDataFile(
+ "uuid_file.avro",
+ Row.of(),
+ 50,
+ ImmutableMap.of(18, 50L),
+ ImmutableMap.of(18, 0L),
+ null,
+ ImmutableMap.of(18, toByteBuffer(Types.UUIDType.get(), UUID_MIN)),
+ ImmutableMap.of(18, toByteBuffer(Types.UUIDType.get(), UUID_MAX)));
+
+ // File with single UUID value (lower == upper)
+ private static final UUID SINGLE_UUID = UUID.fromString("40000000-0000-0000-0000-000000000001");
+ private static final DataFile SINGLE_UUID_FILE =
+ new TestDataFile(
+ "single_uuid_file.avro",
+ Row.of(),
+ 50,
+ ImmutableMap.of(18, 50L),
+ ImmutableMap.of(18, 0L),
+ null,
+ ImmutableMap.of(18, toByteBuffer(Types.UUIDType.get(), SINGLE_UUID)),
+ ImmutableMap.of(18, toByteBuffer(Types.UUIDType.get(), SINGLE_UUID)));
+
+ // File with inverted UUID bounds (as would be written by legacy signed comparator)
+ // In RFC unsigned order: 0x40... < 0x80..., so lower > upper when interpreted with RFC
+ private static final UUID LEGACY_UUID_LOWER =
+ UUID.fromString("80000000-0000-0000-0000-000000000001");
+ private static final UUID LEGACY_UUID_UPPER =
+ UUID.fromString("40000000-0000-0000-0000-000000000001");
+
+ private static final DataFile LEGACY_UUID_FILE =
+ new TestDataFile(
+ "legacy_uuid_file.avro",
+ Row.of(),
+ 50,
+ ImmutableMap.of(18, 50L),
+ ImmutableMap.of(18, 0L),
+ null,
+ ImmutableMap.of(18, toByteBuffer(Types.UUIDType.get(), LEGACY_UUID_LOWER)),
+ ImmutableMap.of(18, toByteBuffer(Types.UUIDType.get(), LEGACY_UUID_UPPER)));
+
@Test
public void testAllNulls() {
boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, notNull("all_nulls")).eval(FILE);
@@ -684,4 +733,146 @@ SCHEMA, lessThanOrEqual("struct.nested_col_with_stats", INT_MAX_VALUE))
new StrictMetricsEvaluator(SCHEMA, notNull("struct.nested_col_with_stats")).eval(FILE);
assertThat(shouldRead).as("notNull nested column should not match").isFalse();
}
+
+ // Tests for UUID with StrictMetricsEvaluator using RFC-compliant comparison only
+ // UUID_FILE has bounds [UUID_MIN=0x00...01, UUID_MAX=0x80...01]
+
+ @Test
+ public void testStrictUuidGt() {
+ // Query: uuid > belowMin (all UUIDs in file should be > this)
+ UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000");
+ boolean allMatch =
+ new StrictMetricsEvaluator(SCHEMA, greaterThan("uuid", belowMin)).eval(UUID_FILE);
+ assertThat(allMatch).as("All UUIDs should be greater than belowMin").isTrue();
+
+ // Query: uuid > UUID_MIN (lower bound) - not all values are > lower bound
+ allMatch = new StrictMetricsEvaluator(SCHEMA, greaterThan("uuid", UUID_MIN)).eval(UUID_FILE);
+ assertThat(allMatch).as("Not all UUIDs are greater than lower bound").isFalse();
+
+ // Query: uuid > middle value - not all values are > middle
+ UUID middle = UUID.fromString("40000000-0000-0000-0000-000000000001");
+ allMatch = new StrictMetricsEvaluator(SCHEMA, greaterThan("uuid", middle)).eval(UUID_FILE);
+ assertThat(allMatch).as("Not all UUIDs are greater than middle").isFalse();
+
+ // Query: uuid > UUID_MAX (upper bound) - no values are > upper bound
+ allMatch = new StrictMetricsEvaluator(SCHEMA, greaterThan("uuid", UUID_MAX)).eval(UUID_FILE);
+ assertThat(allMatch).as("No UUIDs are greater than upper bound").isFalse();
+
+ // Query: uuid > aboveMax - no values are > aboveMax
+ UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff");
+ allMatch = new StrictMetricsEvaluator(SCHEMA, greaterThan("uuid", aboveMax)).eval(UUID_FILE);
+ assertThat(allMatch).as("No UUIDs are greater than aboveMax").isFalse();
+ }
+
+ @Test
+ public void testStrictUuidLt() {
+ // Query: uuid < belowMin - no values are < belowMin
+ UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000");
+ boolean allMatch =
+ new StrictMetricsEvaluator(SCHEMA, lessThan("uuid", belowMin)).eval(UUID_FILE);
+ assertThat(allMatch).as("No UUIDs are less than belowMin").isFalse();
+
+ // Query: uuid < UUID_MIN (lower bound) - no values are < lower bound
+ allMatch = new StrictMetricsEvaluator(SCHEMA, lessThan("uuid", UUID_MIN)).eval(UUID_FILE);
+ assertThat(allMatch).as("No UUIDs are less than lower bound").isFalse();
+
+ // Query: uuid < middle value - not all values are < middle
+ UUID middle = UUID.fromString("40000000-0000-0000-0000-000000000001");
+ allMatch = new StrictMetricsEvaluator(SCHEMA, lessThan("uuid", middle)).eval(UUID_FILE);
+ assertThat(allMatch).as("Not all UUIDs are less than middle").isFalse();
+
+ // Query: uuid < UUID_MAX (upper bound) - not all values are < upper bound
+ allMatch = new StrictMetricsEvaluator(SCHEMA, lessThan("uuid", UUID_MAX)).eval(UUID_FILE);
+ assertThat(allMatch).as("Not all UUIDs are less than upper bound").isFalse();
+
+ // Query: uuid < aboveMax (all UUIDs in file should be < this)
+ UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff");
+ allMatch = new StrictMetricsEvaluator(SCHEMA, lessThan("uuid", aboveMax)).eval(UUID_FILE);
+ assertThat(allMatch).as("All UUIDs should be less than aboveMax").isTrue();
+ }
+
+ @Test
+ public void testStrictUuidEqNeverMatchesRange() {
+ // Strict eq should never match when there's a range of values
+ UUID middle = UUID.fromString("40000000-0000-0000-0000-000000000001");
+ boolean allMatch = new StrictMetricsEvaluator(SCHEMA, equal("uuid", middle)).eval(UUID_FILE);
+ assertThat(allMatch).as("Strict eq should not match range").isFalse();
+ }
+
+ @Test
+ public void testStrictUuidInNeverMatchesRange() {
+ // Strict IN should never match when there's a range of values (lower != upper)
+ UUID middle1 = UUID.fromString("40000000-0000-0000-0000-000000000001");
+ UUID middle2 = UUID.fromString("50000000-0000-0000-0000-000000000001");
+ boolean allMatch =
+ new StrictMetricsEvaluator(SCHEMA, in("uuid", middle1, middle2)).eval(UUID_FILE);
+ assertThat(allMatch).as("Strict IN should not match range").isFalse();
+ }
+
+ @Test
+ public void testStrictUuidInMatchesSingleValue() {
+ // Strict IN should match when lower == upper and the value is in the set
+ boolean allMatch =
+ new StrictMetricsEvaluator(SCHEMA, in("uuid", SINGLE_UUID)).eval(SINGLE_UUID_FILE);
+ assertThat(allMatch).as("Strict IN should match single value in set").isTrue();
+ }
+
+ @Test
+ public void testStrictUuidInDoesNotMatchWhenValueNotInSet() {
+ // Strict IN should not match when lower == upper but the value is not in the set
+ UUID otherUuid = UUID.fromString("50000000-0000-0000-0000-000000000001");
+ boolean allMatch =
+ new StrictMetricsEvaluator(SCHEMA, in("uuid", otherUuid)).eval(SINGLE_UUID_FILE);
+ assertThat(allMatch).as("Strict IN should not match when value not in set").isFalse();
+ }
+
+ @Test
+ public void testStrictUuidNotInMatchesWhenAllValuesOutsideBounds() {
+ // Strict NOT IN should match when all values in the set are outside the file's bounds
+ UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000");
+ UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff");
+ boolean allMatch =
+ new StrictMetricsEvaluator(SCHEMA, notIn("uuid", belowMin, aboveMax)).eval(UUID_FILE);
+ // All values in the set are outside [UUID_MIN, UUID_MAX], so all rows match NOT IN
+ assertThat(allMatch).as("Strict NOT IN should match when all values outside bounds").isTrue();
+ }
+
+ @Test
+ public void testStrictUuidNotInDoesNotMatchWhenValueInBounds() {
+ // Strict NOT IN should not match when a value in the set is within bounds
+ UUID middle = UUID.fromString("40000000-0000-0000-0000-000000000001");
+ boolean allMatch = new StrictMetricsEvaluator(SCHEMA, notIn("uuid", middle)).eval(UUID_FILE);
+ // middle is within [UUID_MIN, UUID_MAX], so some rows might match the value
+ assertThat(allMatch).as("Strict NOT IN should not match when value in bounds").isFalse();
+ }
+
+ // Tests for file with inverted UUID bounds (as would be written by legacy signed comparator)
+
+ @Test
+ public void testStrictUuidInWithLegacyInvertedBounds() {
+ // With inverted bounds [0x80..., 0x40...] where lower > upper in RFC order,
+ // strict IN should never match since lower != upper
+ UUID uuid1 = UUID.fromString("20000000-0000-0000-0000-000000000001");
+ UUID uuid2 = UUID.fromString("30000000-0000-0000-0000-000000000001");
+ boolean allMatch =
+ new StrictMetricsEvaluator(SCHEMA, in("uuid", uuid1, uuid2)).eval(LEGACY_UUID_FILE);
+ assertThat(allMatch).as("Strict IN should not match when lower != upper").isFalse();
+ }
+
+ @Test
+ public void testStrictUuidNotInWithLegacyInvertedBounds() {
+ // Query: NOT IN (0x50..., 0x60...)
+ // With inverted bounds [0x80..., 0x40...], in RFC order: 0x40... < 0x50... < 0x60... < 0x80...
+ // The values 0x50... and 0x60... are between upper (0x40...) and lower (0x80...) in RFC order
+ // Since bounds are inverted, the evaluator sees lower (0x80...) > values, so they appear
+ // below the lower bound, and strict NOT IN returns true
+ UUID uuid1 = UUID.fromString("50000000-0000-0000-0000-000000000001");
+ UUID uuid2 = UUID.fromString("60000000-0000-0000-0000-000000000001");
+ boolean allMatch =
+ new StrictMetricsEvaluator(SCHEMA, notIn("uuid", uuid1, uuid2)).eval(LEGACY_UUID_FILE);
+ // With RFC-only comparison, values appear below lower bound (0x80...), so all rows match NOT IN
+ assertThat(allMatch)
+ .as("Strict NOT IN should match when all values below lower bound in RFC order")
+ .isTrue();
+ }
}
diff --git a/api/src/test/java/org/apache/iceberg/types/TestComparators.java b/api/src/test/java/org/apache/iceberg/types/TestComparators.java
index 691e3f04a074..7127050d60fc 100644
--- a/api/src/test/java/org/apache/iceberg/types/TestComparators.java
+++ b/api/src/test/java/org/apache/iceberg/types/TestComparators.java
@@ -100,6 +100,76 @@ public void testUuid() {
Comparators.forType(Types.UUIDType.get()),
UUID.fromString("81873e7d-1374-4493-8e1d-9095eff7046c"),
UUID.fromString("fd02441d-1423-4a3f-8785-c7dd5647e26b"));
+ assertComparesCorrectly(
+ Comparators.forType(Types.UUIDType.get()),
+ UUID.fromString("00000000-0000-0000-0000-000000000000"),
+ UUID.fromString("60000000-0000-0000-0000-000000000000"));
+ assertComparesCorrectly(
+ Comparators.forType(Types.UUIDType.get()),
+ UUID.fromString("60000000-0000-0000-0000-000000000000"),
+ UUID.fromString("70000000-0000-0000-0000-000000000000"));
+ // The following assertions fail prior to the introduction of unsigned UUID comparator.
+ assertComparesCorrectly(
+ Comparators.forType(Types.UUIDType.get()),
+ UUID.fromString("70000000-0000-0000-0000-000000000000"),
+ UUID.fromString("80000000-0000-0000-0000-000000000000"));
+ assertComparesCorrectly(
+ Comparators.forType(Types.UUIDType.get()),
+ UUID.fromString("80000000-0000-0000-0000-000000000000"),
+ UUID.fromString("90000000-0000-0000-0000-000000000000"));
+ assertComparesCorrectly(
+ Comparators.forType(Types.UUIDType.get()),
+ UUID.fromString("90000000-0000-0000-0000-000000000000"),
+ UUID.fromString("a0000000-0000-0000-0000-000000000000"));
+ assertComparesCorrectly(
+ Comparators.forType(Types.UUIDType.get()),
+ UUID.fromString("a0000000-0000-0000-0000-000000000000"),
+ UUID.fromString("f0000000-0000-0000-0000-000000000000"));
+ assertComparesCorrectly(
+ Comparators.forType(Types.UUIDType.get()),
+ UUID.fromString("ffffffff-ffff-ffff-ffff-fffffffffffe"),
+ UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"));
+ }
+
+ @Test
+ public void testSignedUuid() {
+ // Test the signed UUID comparator used for backward compatibility with legacy files.
+ // In signed comparison, UUIDs with high bit set (0x80...) are treated as negative,
+ // so they compare less than UUIDs without high bit set (0x00-0x7F...).
+ Comparator signedCmp = Comparators.signedUUIDs();
+
+ // Within the "positive" range (0x00-0x7F), ordering is the same as unsigned
+ assertComparesCorrectly(
+ signedCmp,
+ UUID.fromString("00000000-0000-0000-0000-000000000000"),
+ UUID.fromString("40000000-0000-0000-0000-000000000000"));
+ assertComparesCorrectly(
+ signedCmp,
+ UUID.fromString("40000000-0000-0000-0000-000000000000"),
+ UUID.fromString("70000000-0000-0000-0000-000000000000"));
+
+ // Within the "negative" range (0x80-0xFF), ordering is the same as unsigned
+ assertComparesCorrectly(
+ signedCmp,
+ UUID.fromString("80000000-0000-0000-0000-000000000000"),
+ UUID.fromString("90000000-0000-0000-0000-000000000000"));
+ assertComparesCorrectly(
+ signedCmp,
+ UUID.fromString("c0000000-0000-0000-0000-000000000000"),
+ UUID.fromString("f0000000-0000-0000-0000-000000000000"));
+
+ // In signed comparison, 0x80... < 0x70... because the high bit is treated as negative.
+ // In unsigned comparison, 0x70... < 0x80... because bytes are compared as unsigned values.
+ UUID highBit = UUID.fromString("80000000-0000-0000-0000-000000000000");
+ UUID lowBit = UUID.fromString("70000000-0000-0000-0000-000000000000");
+ assertThat(signedCmp.compare(highBit, lowBit))
+ .as("In signed comparison, 0x80... should be less than 0x70...")
+ .isLessThan(0);
+
+ Comparator unsignedCmp = Comparators.forType(Types.UUIDType.get());
+ assertThat(unsignedCmp.compare(highBit, lowBit))
+ .as("In unsigned comparison, 0x80... should be greater than 0x70...")
+ .isGreaterThan(0);
}
@Test
diff --git a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java
index 3cb46b309d82..4c0b6b4d3844 100644
--- a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java
+++ b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java
@@ -41,10 +41,13 @@
import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatThrownBy;
import static org.assertj.core.api.Assumptions.assumeThat;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
import java.io.File;
import java.io.IOException;
import java.io.UncheckedIOException;
+import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.List;
import java.util.UUID;
@@ -69,21 +72,27 @@
import org.apache.iceberg.orc.ORC;
import org.apache.iceberg.parquet.Parquet;
import org.apache.iceberg.parquet.ParquetMetricsRowGroupFilter;
+import org.apache.iceberg.parquet.ParquetSchemaUtil;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.types.Types.DoubleType;
import org.apache.iceberg.types.Types.FloatType;
import org.apache.iceberg.types.Types.IntegerType;
import org.apache.iceberg.types.Types.StringType;
+import org.apache.iceberg.util.UUIDUtil;
import org.apache.iceberg.variants.ShreddedObject;
import org.apache.iceberg.variants.Variant;
import org.apache.iceberg.variants.VariantMetadata;
import org.apache.iceberg.variants.Variants;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
+import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
+import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
+import org.apache.parquet.hadoop.metadata.ColumnPath;
import org.apache.parquet.io.DelegatingSeekableInputStream;
+import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.MessageType;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.TestTemplate;
@@ -162,6 +171,17 @@ public static List