diff --git a/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java b/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java index 9bb2b713439d..80dd4466f629 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java +++ b/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java @@ -24,12 +24,14 @@ import java.time.temporal.ChronoUnit; import java.util.List; import java.util.Locale; +import java.util.UUID; import java.util.concurrent.TimeUnit; import java.util.function.Function; import java.util.regex.Pattern; import java.util.stream.Collectors; import java.util.stream.IntStream; import java.util.stream.StreamSupport; +import javax.annotation.Nullable; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; import org.apache.iceberg.Table; @@ -744,4 +746,105 @@ private static PartitionSpec identitySpec(Schema schema, int... ids) { return specBuilder.build(); } + + /** + * Transform UUID literals in an unbound expression to use signed comparators, if the expression + * contains UUID bounds predicates. This maintains backward compatibility with files written + * before RFC 4122/9562 compliant comparison was implemented. + * + *

The transformed expression contains literals with signed comparators for lt/gt/eq + * predicates. For IN predicates, the comparator information is lost when binding converts + * literals to a Set of raw values, so the evaluator must handle this separately. + * + * @param expr an unbound expression + * @return the expression with signed UUID comparators, or null if no UUID predicates are present + */ + @Nullable + public static Expression toSignedUUIDLiteral(Expression expr) { + SignedUUIDLiteralVisitor visitor = new SignedUUIDLiteralVisitor(); + Expression transformed = ExpressionVisitors.visit(expr, visitor); + return visitor.foundUUIDBoundsPredicate() ? transformed : null; + } + + /** + * Visitor that transforms an expression to use the signed UUID comparator in all UUID literals, + * while also tracking whether any UUID bounds predicates were found. + */ + private static class SignedUUIDLiteralVisitor + extends ExpressionVisitors.ExpressionVisitor { + + private boolean foundUUIDBoundsPredicate = false; + + boolean foundUUIDBoundsPredicate() { + return foundUUIDBoundsPredicate; + } + + @Override + public Expression alwaysTrue() { + return Expressions.alwaysTrue(); + } + + @Override + public Expression alwaysFalse() { + return Expressions.alwaysFalse(); + } + + @Override + public Expression not(Expression result) { + return Expressions.not(result); + } + + @Override + public Expression and(Expression leftResult, Expression rightResult) { + return Expressions.and(leftResult, rightResult); + } + + @Override + public Expression or(Expression leftResult, Expression rightResult) { + return Expressions.or(leftResult, rightResult); + } + + @Override + public Expression predicate(BoundPredicate pred) { + // Bound predicates should not be transformed - this is for unbound expressions + throw new UnsupportedOperationException( + "Cannot transform bound predicate; use unbound expressions"); + } + + @Override + @SuppressWarnings("unchecked") + public Expression predicate(UnboundPredicate pred) { + switch (pred.op()) { + case LT: + case LT_EQ: + case GT: + case GT_EQ: + case EQ: + case NOT_EQ: + if (pred.literal().value() instanceof UUID) { + foundUUIDBoundsPredicate = true; + Literals.UUIDLiteral uuidLit = (Literals.UUIDLiteral) pred.literal(); + return new UnboundPredicate<>( + pred.op(), pred.term(), (T) uuidLit.withSignedComparator()); + } + return pred; + + case IN: + case NOT_IN: + List> literals = pred.literals(); + if (!literals.isEmpty() && literals.get(0).value() instanceof UUID) { + foundUUIDBoundsPredicate = true; + List transformedValues = + literals.stream() + .map(l -> (T) ((Literals.UUIDLiteral) l).withSignedComparator()) + .collect(Collectors.toList()); + return new UnboundPredicate<>(pred.op(), pred.term(), transformedValues); + } + return pred; + + default: + return pred; + } + } + } } diff --git a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java index 81cbbe785519..5a6c52df4e49 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java @@ -56,6 +56,10 @@ public class InclusiveMetricsEvaluator { private static final int IN_PREDICATE_LIMIT = 200; private final Expression expr; + // Expression using signed UUID comparator for backward compatibility with files written + // prior to the introduction of RFC-compliant UUID comparisons. + // Null if there are no UUID predicates comparing against bounds in the expression. + private final Expression signedUuidExpr; public InclusiveMetricsEvaluator(Schema schema, Expression unbound) { this(schema, unbound, true); @@ -63,7 +67,14 @@ public InclusiveMetricsEvaluator(Schema schema, Expression unbound) { public InclusiveMetricsEvaluator(Schema schema, Expression unbound, boolean caseSensitive) { StructType struct = schema.asStruct(); - this.expr = Binder.bind(struct, rewriteNot(unbound), caseSensitive); + Expression rewritten = rewriteNot(unbound); + this.expr = Binder.bind(struct, rewritten, caseSensitive); + + // Create the signed UUID expression if and only if there are UUID predicates + // that compare against bounds. + Expression transformed = ExpressionUtil.toSignedUUIDLiteral(rewritten); + this.signedUuidExpr = + transformed != null ? Binder.bind(struct, transformed, caseSensitive) : null; } /** @@ -74,20 +85,34 @@ public InclusiveMetricsEvaluator(Schema schema, Expression unbound, boolean case */ public boolean eval(ContentFile file) { // TODO: detect the case where a column is missing from the file using file's max field id. - return new MetricsEvalVisitor().eval(file); + boolean result = new MetricsEvalVisitor().eval(file, expr, false); + + // If the RFC-compliant evaluation says rows might match, or there's no signed UUID expression, + // return the result. + if (result || signedUuidExpr == null) { + return result; + } + + // Always try with signed UUID comparator as a fallback. There is no reliable way to detect + // whether signed or unsigned comparator was used when the UUID column stats were written. + return new MetricsEvalVisitor().eval(file, signedUuidExpr, true); } private static final boolean ROWS_MIGHT_MATCH = true; private static final boolean ROWS_CANNOT_MATCH = false; - private class MetricsEvalVisitor extends ExpressionVisitors.BoundVisitor { + private static class MetricsEvalVisitor extends ExpressionVisitors.BoundVisitor { private Map valueCounts = null; private Map nullCounts = null; private Map nanCounts = null; private Map lowerBounds = null; private Map upperBounds = null; + // Flag to use signed UUID comparator for backward compatibility. + // This is needed for the IN predicate because the comparator information is lost + // when binding converts literals to a Set of raw values. + private boolean useSignedUuidComparator = false; - private boolean eval(ContentFile file) { + private boolean eval(ContentFile file, Expression expression, boolean signedUuidMode) { if (file.recordCount() == 0) { return ROWS_CANNOT_MATCH; } @@ -104,8 +129,9 @@ private boolean eval(ContentFile file) { this.nanCounts = file.nanValueCounts(); this.lowerBounds = file.lowerBounds(); this.upperBounds = file.upperBounds(); + this.useSignedUuidComparator = signedUuidMode; - return ExpressionVisitors.visitEvaluator(expr, this); + return ExpressionVisitors.visitEvaluator(expression, this); } @Override @@ -359,10 +385,12 @@ public Boolean in(Bound term, Set literalSet) { return ROWS_MIGHT_MATCH; } + // use an appropriate comparator, for UUIDs use the signed comparator if requested + Comparator cmp = + Comparators.comparatorFor( + term.ref().type(), ((BoundTerm) term).comparator(), useSignedUuidComparator); literals = - literals.stream() - .filter(v -> ((BoundTerm) term).comparator().compare(lower, v) <= 0) - .collect(Collectors.toList()); + literals.stream().filter(v -> cmp.compare(lower, v) <= 0).collect(Collectors.toList()); // if all values are less than lower bound, rows cannot match if (literals.isEmpty()) { return ROWS_CANNOT_MATCH; @@ -374,9 +402,7 @@ public Boolean in(Bound term, Set literalSet) { } literals = - literals.stream() - .filter(v -> ((BoundTerm) term).comparator().compare(upper, v) >= 0) - .collect(Collectors.toList()); + literals.stream().filter(v -> cmp.compare(upper, v) >= 0).collect(Collectors.toList()); // if remaining values are greater than upper bound, rows cannot match if (literals.isEmpty()) { return ROWS_CANNOT_MATCH; diff --git a/api/src/main/java/org/apache/iceberg/expressions/Literals.java b/api/src/main/java/org/apache/iceberg/expressions/Literals.java index 3a45eb804f35..a1d8a39772bd 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/Literals.java +++ b/api/src/main/java/org/apache/iceberg/expressions/Literals.java @@ -608,9 +608,25 @@ public String toString() { } } - static class UUIDLiteral extends ComparableLiteral { + static class UUIDLiteral extends BaseLiteral { + // unsigned byte-wise comparator per RFC 4122/9562 specification + private static final Comparator UNSIGNED_CMP = + Comparators.nullsFirst().thenComparing(Comparators.uuids()); + // signed comparator for backward compatibility with files written before the introduction of + // RFC-compliant comparison + private static final Comparator SIGNED_CMP = + Comparators.nullsFirst().thenComparing(Comparators.signedUUIDs()); + + // Flag to indicate which comparator to use (serializable) + private final boolean useSignedComparator; + UUIDLiteral(UUID value) { + this(value, false); + } + + UUIDLiteral(UUID value, boolean useSignedComparator) { super(value); + this.useSignedComparator = useSignedComparator; } @Override @@ -622,10 +638,23 @@ public Literal to(Type type) { return null; } + @Override + public Comparator comparator() { + return useSignedComparator ? SIGNED_CMP : UNSIGNED_CMP; + } + @Override protected Type.TypeID typeId() { return Type.TypeID.UUID; } + + /** + * Creates a new UUIDLiteral with the signed comparator for backward compatibility with files + * written before RFC-compliant UUID comparisons were introduced. + */ + UUIDLiteral withSignedComparator() { + return new UUIDLiteral(value(), true); + } } static class FixedLiteral extends BaseLiteral { diff --git a/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java index 21762946ca33..1a213e7a666d 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java @@ -34,6 +34,7 @@ import org.apache.iceberg.types.Comparators; import org.apache.iceberg.types.Conversions; import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; import org.apache.iceberg.util.BinaryUtil; /** @@ -51,6 +52,9 @@ public class ManifestEvaluator { private static final int IN_PREDICATE_LIMIT = 200; private final Expression expr; + // Expression using signed UUID comparator for backward compatibility with files written before + // RFC-compliant UUID comparisons were introduced. Null if no UUID predicates. + private final Expression signedUuidExpr; public static ManifestEvaluator forRowFilter( Expression rowFilter, PartitionSpec spec, boolean caseSensitive) { @@ -64,7 +68,15 @@ public static ManifestEvaluator forPartitionFilter( } private ManifestEvaluator(PartitionSpec spec, Expression partitionFilter, boolean caseSensitive) { - this.expr = Binder.bind(spec.partitionType(), rewriteNot(partitionFilter), caseSensitive); + Types.StructType partitionType = spec.partitionType(); + Expression rewritten = rewriteNot(partitionFilter); + this.expr = Binder.bind(partitionType, rewritten, caseSensitive); + + // Create the signed UUID expression if and only if there are UUID predicates + // that compare against bounds. + Expression transformed = ExpressionUtil.toSignedUUIDLiteral(rewritten); + this.signedUuidExpr = + transformed != null ? Binder.bind(partitionType, transformed, caseSensitive) : null; } /** @@ -74,22 +86,37 @@ private ManifestEvaluator(PartitionSpec spec, Expression partitionFilter, boolea * @return false if the file cannot contain rows that match the expression, true otherwise. */ public boolean eval(ManifestFile manifest) { - return new ManifestEvalVisitor().eval(manifest); + boolean result = new ManifestEvalVisitor().eval(manifest, expr, false); + + // If the RFC-compliant evaluation says rows might match, or there's no signed UUID expression, + // return the result. + if (result || signedUuidExpr == null) { + return result; + } + + // Always try with signed UUID comparator as a fallback. There is no reliable way to detect + // whether signed or unsigned comparator was used when the UUID column stats were written. + return new ManifestEvalVisitor().eval(manifest, signedUuidExpr, true); } private static final boolean ROWS_MIGHT_MATCH = true; private static final boolean ROWS_CANNOT_MATCH = false; - private class ManifestEvalVisitor extends BoundExpressionVisitor { + private static class ManifestEvalVisitor extends BoundExpressionVisitor { private List stats = null; + // Flag to use signed UUID comparator for backward compatibility. + // This is specifically necessary for IN predicates because the comparator information + // is lost when binding converts literals to a Set of raw values. + private boolean useSignedUuidComparator = false; - private boolean eval(ManifestFile manifest) { + private boolean eval(ManifestFile manifest, Expression expression, boolean signedUuidMode) { this.stats = manifest.partitions(); + this.useSignedUuidComparator = signedUuidMode; if (stats == null) { return ROWS_MIGHT_MATCH; } - return ExpressionVisitors.visitEvaluator(expr, this); + return ExpressionVisitors.visitEvaluator(expression, this); } @Override @@ -295,20 +322,19 @@ public Boolean in(BoundReference ref, Set literalSet) { return ROWS_MIGHT_MATCH; } + // use an appropriate comparator, for UUIDs use the signed comparator if requested + Comparator cmp = + Comparators.comparatorFor(ref.type(), ref.comparator(), useSignedUuidComparator); T lower = Conversions.fromByteBuffer(ref.type(), fieldStats.lowerBound()); literals = - literals.stream() - .filter(v -> ref.comparator().compare(lower, v) <= 0) - .collect(Collectors.toList()); + literals.stream().filter(v -> cmp.compare(lower, v) <= 0).collect(Collectors.toList()); if (literals.isEmpty()) { // if all values are less than lower bound, rows cannot match. return ROWS_CANNOT_MATCH; } T upper = Conversions.fromByteBuffer(ref.type(), fieldStats.upperBound()); literals = - literals.stream() - .filter(v -> ref.comparator().compare(upper, v) >= 0) - .collect(Collectors.toList()); + literals.stream().filter(v -> cmp.compare(upper, v) >= 0).collect(Collectors.toList()); if (literals .isEmpty()) { // if all remaining values are greater than upper bound, rows cannot match. return ROWS_CANNOT_MATCH; diff --git a/api/src/main/java/org/apache/iceberg/types/Comparators.java b/api/src/main/java/org/apache/iceberg/types/Comparators.java index ab59c895686d..a1821eee7332 100644 --- a/api/src/main/java/org/apache/iceberg/types/Comparators.java +++ b/api/src/main/java/org/apache/iceberg/types/Comparators.java @@ -22,6 +22,7 @@ import java.util.Comparator; import java.util.List; import java.util.Map; +import java.util.UUID; import java.util.function.IntFunction; import org.apache.iceberg.StructLike; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; @@ -46,7 +47,7 @@ private Comparators() {} .put(Types.TimestampNanoType.withZone(), Comparator.naturalOrder()) .put(Types.TimestampNanoType.withoutZone(), Comparator.naturalOrder()) .put(Types.StringType.get(), Comparators.charSequences()) - .put(Types.UUIDType.get(), Comparator.naturalOrder()) + .put(Types.UUIDType.get(), Comparators.uuids()) .put(Types.BinaryType.get(), Comparators.unsignedBytes()) .put(Types.UnknownType.get(), Comparator.nullsFirst(Comparator.naturalOrder())) .buildOrThrow(); @@ -233,6 +234,42 @@ public static Comparator filePath() { return FilePathComparator.INSTANCE; } + public static Comparator uuids() { + return UUIDComparator.INSTANCE; + } + + /** + * Returns a comparator that uses Java's native signed UUID comparison. + * + *

This is provided for backward compatibility with older files that may have been written with + * min/max statistics computed using Java's signed {@link UUID#compareTo(UUID)}. New code should + * use {@link #uuids()} which provides RFC 4122/9562 compliant unsigned comparison. + * + * @return a comparator using signed UUID comparison + */ + public static Comparator signedUUIDs() { + return Comparator.naturalOrder(); + } + + /** + * Returns the given comparator for a type or the signed UUID comparator, conditionally. + * + * @param type the Iceberg type + * @param defaultComparator the default comparator to use for non-UUID types or when signed UUID + * comparison is not requested + * @param useSignedUuid if true and the type is UUID, returns a signed UUID comparator + * @return the appropriate comparator + */ + @SuppressWarnings("unchecked") + public static Comparator comparatorFor( + Type type, Comparator defaultComparator, boolean useSignedUuid) { + if (useSignedUuid && type.typeId() == Type.TypeID.UUID) { + return (Comparator) signedUUIDs(); + } + + return defaultComparator; + } + private static class NullsFirst implements Comparator { private static final NullsFirst INSTANCE = new NullsFirst<>(); @@ -448,4 +485,40 @@ public int compare(CharSequence s1, CharSequence s2) { return 0; } } + + /** + * Compares UUIDs using unsigned byte-wise comparison using big-endian byte-order compliant with + * RFC 4122 and RFC 9562. Java's UUID.compareTo() compares most significant bits first, then least + * significant bits using signed value comparisons, which is a known bug. + */ + private static class UUIDComparator implements Comparator { + private static final UUIDComparator INSTANCE = new UUIDComparator(); + + private UUIDComparator() {} + + @Override + public int compare(UUID uuid1, UUID uuid2) { + if (uuid1 == uuid2) { + return 0; + } + + // Compare most significant bits first (bytes 0-7 in big-endian representation) + long msb1 = uuid1.getMostSignificantBits(); + long msb2 = uuid2.getMostSignificantBits(); + + // Use unsigned comparison for the most significant bits + int msbCompare = Long.compareUnsigned(msb1, msb2); + if (msbCompare != 0) { + return msbCompare; + } + + // If most significant bits are equal, compare least significant bits (bytes 8-15) + long lsb1 = uuid1.getLeastSignificantBits(); + long lsb2 = uuid2.getLeastSignificantBits(); + + // Use unsigned comparison for the least significant bits + return Long.compareUnsigned(lsb1, lsb2); + } + } } diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java index d9fe26eacc6b..6373e73a12ec 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java @@ -29,6 +29,7 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.UUID; import java.util.regex.Pattern; import java.util.stream.IntStream; import org.apache.iceberg.PartitionSpec; @@ -44,6 +45,7 @@ import org.apache.iceberg.variants.VariantPrimitive; import org.apache.iceberg.variants.VariantTestUtil; import org.apache.iceberg.variants.VariantValue; +import org.assertj.core.api.InstanceOfAssertFactories; import org.junit.jupiter.api.Test; public class TestExpressionUtil { @@ -1350,4 +1352,168 @@ private void assertEquals(UnboundTransform expected, UnboundTransform originalPredicate = (UnboundPredicate) original; + assertThat(originalPredicate.literal()) + .isInstanceOf(Literals.UUIDLiteral.class) + .asInstanceOf(InstanceOfAssertFactories.type(Literals.UUIDLiteral.class)) + .extracting("useSignedComparator") + .isEqualTo(false); + + Expression result = ExpressionUtil.toSignedUUIDLiteral(original); + assertThat(result).isNotNull().isInstanceOf(UnboundPredicate.class); + UnboundPredicate predicate = (UnboundPredicate) result; + assertThat(predicate.literal().value()).isEqualTo(testUuid); + // The literal should now use the signed comparator + assertThat(predicate.literal()) + .isInstanceOf(Literals.UUIDLiteral.class) + .asInstanceOf(InstanceOfAssertFactories.type(Literals.UUIDLiteral.class)) + .extracting("useSignedComparator") + .isEqualTo(true); + } + + @Test + public void testToSignedUUIDLiteralTransformsLtPredicate() { + UUID testUuid = UUID.randomUUID(); + Expression original = Expressions.lessThan("uuid_col", testUuid); + + // Verify the original literal uses unsigned comparator (useSignedComparator = false) + UnboundPredicate originalPredicate = (UnboundPredicate) original; + assertThat(originalPredicate.literal()) + .isInstanceOf(Literals.UUIDLiteral.class) + .asInstanceOf(InstanceOfAssertFactories.type(Literals.UUIDLiteral.class)) + .extracting("useSignedComparator") + .isEqualTo(false); + + Expression result = ExpressionUtil.toSignedUUIDLiteral(original); + assertThat(result).isNotNull().isInstanceOf(UnboundPredicate.class); + UnboundPredicate predicate = (UnboundPredicate) result; + assertThat(predicate.literal()) + .isInstanceOf(Literals.UUIDLiteral.class) + .asInstanceOf(InstanceOfAssertFactories.type(Literals.UUIDLiteral.class)) + .extracting("useSignedComparator") + .isEqualTo(true); + } + + @Test + public void testToSignedUUIDLiteralTransformsGtPredicate() { + UUID testUuid = UUID.randomUUID(); + Expression original = Expressions.greaterThan("uuid_col", testUuid); + + // Verify the original literal uses unsigned comparator (useSignedComparator = false) + UnboundPredicate originalPredicate = (UnboundPredicate) original; + assertThat(originalPredicate.literal()) + .isInstanceOf(Literals.UUIDLiteral.class) + .asInstanceOf(InstanceOfAssertFactories.type(Literals.UUIDLiteral.class)) + .extracting("useSignedComparator") + .isEqualTo(false); + + Expression result = ExpressionUtil.toSignedUUIDLiteral(original); + assertThat(result).isNotNull().isInstanceOf(UnboundPredicate.class); + UnboundPredicate predicate = (UnboundPredicate) result; + assertThat(predicate.literal()) + .isInstanceOf(Literals.UUIDLiteral.class) + .asInstanceOf(InstanceOfAssertFactories.type(Literals.UUIDLiteral.class)) + .extracting("useSignedComparator") + .isEqualTo(true); + } + + @Test + public void testToSignedUUIDLiteralTransformsInPredicate() { + UUID uuid1 = UUID.randomUUID(); + UUID uuid2 = UUID.randomUUID(); + Expression original = Expressions.in("uuid_col", uuid1, uuid2); + + // Verify the original literals use unsigned comparator (useSignedComparator = false) + UnboundPredicate originalPredicate = (UnboundPredicate) original; + originalPredicate + .literals() + .forEach( + lit -> { + assertThat(lit) + .isInstanceOf(Literals.UUIDLiteral.class) + .asInstanceOf(InstanceOfAssertFactories.type(Literals.UUIDLiteral.class)) + .extracting("useSignedComparator") + .isEqualTo(false); + }); + + Expression result = ExpressionUtil.toSignedUUIDLiteral(original); + assertThat(result).isNotNull().isInstanceOf(UnboundPredicate.class); + UnboundPredicate predicate = (UnboundPredicate) result; + assertThat(predicate.literals()).hasSize(2); + // All literals should use the signed comparator + predicate + .literals() + .forEach( + lit -> { + assertThat(lit) + .isInstanceOf(Literals.UUIDLiteral.class) + .asInstanceOf(InstanceOfAssertFactories.type(Literals.UUIDLiteral.class)) + .extracting("useSignedComparator") + .isEqualTo(true); + }); + } + + @Test + public void testToSignedUUIDLiteralTransformsCompoundExpression() { + UUID testUuid = UUID.randomUUID(); + Expression original = + Expressions.and( + Expressions.equal("id", 42L), Expressions.greaterThan("uuid_col", testUuid)); + + // Verify the original UUID literal uses unsigned comparator (useSignedComparator = false) + And originalAnd = (And) original; + UnboundPredicate originalUuidPredicate = (UnboundPredicate) originalAnd.right(); + assertThat(originalUuidPredicate.literal()) + .isInstanceOf(Literals.UUIDLiteral.class) + .asInstanceOf(InstanceOfAssertFactories.type(Literals.UUIDLiteral.class)) + .extracting("useSignedComparator") + .isEqualTo(false); + + Expression result = ExpressionUtil.toSignedUUIDLiteral(original); + assertThat(result).isNotNull().isInstanceOf(And.class); + And and = (And) result; + + // Verify the non-UUID predicate is unchanged + assertThat(and.left()).isInstanceOf(UnboundPredicate.class); + UnboundPredicate leftPredicate = (UnboundPredicate) and.left(); + assertThat(leftPredicate.literal()) + .isInstanceOf(Literals.LongLiteral.class) + .extracting("value") + .isEqualTo(42L); + + // Verify the UUID predicate is transformed with signed comparator + assertThat(and.right()).isInstanceOf(UnboundPredicate.class); + UnboundPredicate rightPredicate = (UnboundPredicate) and.right(); + assertThat(rightPredicate.literal()) + .isInstanceOf(Literals.UUIDLiteral.class) + .extracting("useSignedComparator") + .isEqualTo(true); + } } diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java index 78e6064eb427..f83b87d61aae 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java @@ -42,6 +42,7 @@ import static org.assertj.core.api.Assertions.assertThatThrownBy; import java.nio.ByteBuffer; +import java.util.UUID; import org.apache.iceberg.ManifestFile; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; @@ -69,7 +70,8 @@ public class TestInclusiveManifestEvaluator { optional(15, "no_nulls_same_value_a", Types.StringType.get()), optional(16, "single_value_with_nan", Types.FloatType.get()), optional(17, "single_value_nan_unknown", Types.FloatType.get()), - optional(18, "single_value_no_nan", Types.FloatType.get())); + optional(18, "single_value_no_nan", Types.FloatType.get()), + optional(19, "uuid", Types.UUIDType.get())); private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA) @@ -90,6 +92,7 @@ public class TestInclusiveManifestEvaluator { .identity("single_value_with_nan") .identity("single_value_nan_unknown") .identity("single_value_no_nan") + .identity("uuid") .build(); private static final int INT_MIN_VALUE = 30; @@ -101,6 +104,75 @@ public class TestInclusiveManifestEvaluator { private static final ByteBuffer STRING_MIN = toByteBuffer(Types.StringType.get(), "a"); private static final ByteBuffer STRING_MAX = toByteBuffer(Types.StringType.get(), "z"); + // UUID_MIN has all zeros in MSB, all ones in LSB: 00000000-0000-0000-ffff-ffffffffffff + // UUID_MAX has all ones in MSB, all zeros in LSB: ffffffff-ffff-ffff-0000-000000000000 + // With unsigned byte-wise comparison (correct): UUID_MIN < UUID_MAX (0x00... < 0xFF...) + // With Java's natural order (incorrect): UUID_MIN > UUID_MAX (MSB 0 > MSB -1 as signed long) + private static final UUID UUID_MIN_VALUE = + UUID.fromString("00000000-0000-0000-ffff-ffffffffffff"); + private static final UUID UUID_MAX_VALUE = + UUID.fromString("ffffffff-ffff-ffff-0000-000000000000"); + + private static final ByteBuffer UUID_MIN = toByteBuffer(Types.UUIDType.get(), UUID_MIN_VALUE); + private static final ByteBuffer UUID_MAX = toByteBuffer(Types.UUIDType.get(), UUID_MAX_VALUE); + + // Legacy manifest with "inverted" bounds (min > max in unsigned order) + // Used for testing backward compatibility with files written using signed UUID comparator + private static final UUID LEGACY_UUID_MIN = + UUID.fromString("80000000-0000-0000-0000-000000000001"); + private static final UUID LEGACY_UUID_MAX = + UUID.fromString("40000000-0000-0000-0000-000000000001"); + + private static final ManifestFile LEGACY_UUID_MANIFEST = + new TestHelpers.TestManifestFile( + "legacy-uuid-manifest.avro", + 1024, + 0, + System.currentTimeMillis(), + 5, + 10, + 0, + ImmutableList.of( + new TestHelpers.TestFieldSummary(false, INT_MIN, INT_MAX), + new TestHelpers.TestFieldSummary(true, null, null), + new TestHelpers.TestFieldSummary(true, STRING_MIN, STRING_MAX), + new TestHelpers.TestFieldSummary(false, STRING_MIN, STRING_MAX), + new TestHelpers.TestFieldSummary( + false, + toByteBuffer(Types.FloatType.get(), 0F), + toByteBuffer(Types.FloatType.get(), 20F)), + new TestHelpers.TestFieldSummary(true, null, null), + new TestHelpers.TestFieldSummary(true, false, null, null), + new TestHelpers.TestFieldSummary(false, true, null, null), + new TestHelpers.TestFieldSummary(true, true, null, null), + new TestHelpers.TestFieldSummary( + false, + false, + toByteBuffer(Types.FloatType.get(), 0F), + toByteBuffer(Types.FloatType.get(), 20F)), + new TestHelpers.TestFieldSummary(true, null, null), + new TestHelpers.TestFieldSummary(true, STRING_MIN, STRING_MIN), + new TestHelpers.TestFieldSummary(false, STRING_MIN, STRING_MIN), + new TestHelpers.TestFieldSummary( + false, + true, + toByteBuffer(Types.FloatType.get(), 5.0F), + toByteBuffer(Types.FloatType.get(), 5.0F)), + new TestHelpers.TestFieldSummary( + false, + toByteBuffer(Types.FloatType.get(), 5.0F), + toByteBuffer(Types.FloatType.get(), 5.0F)), + new TestHelpers.TestFieldSummary( + false, + false, + toByteBuffer(Types.FloatType.get(), 5.0F), + toByteBuffer(Types.FloatType.get(), 5.0F)), + new TestHelpers.TestFieldSummary( + false, + toByteBuffer(Types.UUIDType.get(), LEGACY_UUID_MIN), + toByteBuffer(Types.UUIDType.get(), LEGACY_UUID_MAX))), + null); + private static final ManifestFile NO_STATS = new TestHelpers.TestManifestFile( "manifest-list.avro", 1024, 0, System.currentTimeMillis(), null, null, null, null, null); @@ -148,7 +220,8 @@ public class TestInclusiveManifestEvaluator { false, false, toByteBuffer(Types.FloatType.get(), 5.0F), - toByteBuffer(Types.FloatType.get(), 5.0F))), + toByteBuffer(Types.FloatType.get(), 5.0F)), + new TestHelpers.TestFieldSummary(false, UUID_MIN, UUID_MAX)), null); @Test @@ -853,4 +926,264 @@ public void testNotInWithSingleValue() { .as("Should not read: manifest contains single float value with no NaNs") .isFalse(); } + + @Test + public void testUuidEq() { + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + boolean shouldRead = + ManifestEvaluator.forRowFilter(equal("uuid", belowMin), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should not read: uuid below lower bound").isFalse(); + + shouldRead = + ManifestEvaluator.forRowFilter(equal("uuid", UUID_MIN_VALUE), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: uuid equal to lower bound").isTrue(); + + UUID between = UUID.fromString("7fffffff-ffff-ffff-7fff-ffffffffffff"); + shouldRead = ManifestEvaluator.forRowFilter(equal("uuid", between), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: uuid between lower and upper bounds").isTrue(); + + shouldRead = + ManifestEvaluator.forRowFilter(equal("uuid", UUID_MAX_VALUE), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: uuid equal to upper bound").isTrue(); + + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = ManifestEvaluator.forRowFilter(equal("uuid", aboveMax), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should not read: uuid above upper bound").isFalse(); + } + + @Test + public void testUuidLt() { + // With RFC comparison, belowMin is below the lower bound so no rows can be < belowMin. + // With signed comparison, UUID_MIN_VALUE.compareTo(belowMin) = -1 (lower < lit), + // so rows might match. We try both comparators and return true if either matches. + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + boolean shouldRead = + ManifestEvaluator.forRowFilter(lessThan("uuid", belowMin), SPEC, true).eval(FILE); + assertThat(shouldRead) + .as("Should read: signed UUID fallback finds matches with inverted bounds") + .isTrue(); + + // UUID_MIN_VALUE is the lower bound, so no rows can be < UUID_MIN_VALUE. + // Both RFC and signed comparators agree on this since we're comparing the value to itself. + shouldRead = + ManifestEvaluator.forRowFilter(lessThan("uuid", UUID_MIN_VALUE), SPEC, true).eval(FILE); + assertThat(shouldRead) + .as("Should not read: uuid range below lower bound (UUID_MIN is not < UUID_MIN)") + .isFalse(); + + UUID justAboveMin = UUID.fromString("00000000-0000-0001-0000-000000000000"); + shouldRead = + ManifestEvaluator.forRowFilter(lessThan("uuid", justAboveMin), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: one possible uuid").isTrue(); + + shouldRead = + ManifestEvaluator.forRowFilter(lessThan("uuid", UUID_MAX_VALUE), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: uuid between lower and upper bounds").isTrue(); + + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = ManifestEvaluator.forRowFilter(lessThan("uuid", aboveMax), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: all uuids in range").isTrue(); + } + + @Test + public void testUuidLtEq() { + // With RFC comparison, belowMin is below the lower bound so no rows can be <= belowMin. + // With signed comparison, the bounds are inverted, so rows might match. + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + boolean shouldRead = + ManifestEvaluator.forRowFilter(lessThanOrEqual("uuid", belowMin), SPEC, true).eval(FILE); + assertThat(shouldRead) + .as("Should read: signed UUID fallback finds matches with inverted bounds") + .isTrue(); + + shouldRead = + ManifestEvaluator.forRowFilter(lessThanOrEqual("uuid", UUID_MIN_VALUE), SPEC, true) + .eval(FILE); + assertThat(shouldRead).as("Should read: one possible uuid").isTrue(); + + shouldRead = + ManifestEvaluator.forRowFilter(lessThanOrEqual("uuid", UUID_MAX_VALUE), SPEC, true) + .eval(FILE); + assertThat(shouldRead).as("Should read: all uuids in range").isTrue(); + + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = + ManifestEvaluator.forRowFilter(lessThanOrEqual("uuid", aboveMax), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: all uuids in range").isTrue(); + } + + @Test + public void testUuidGt() { + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + boolean shouldRead = + ManifestEvaluator.forRowFilter(greaterThan("uuid", belowMin), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: all uuids in range").isTrue(); + + shouldRead = + ManifestEvaluator.forRowFilter(greaterThan("uuid", UUID_MIN_VALUE), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: uuid between lower and upper bounds").isTrue(); + + UUID justBelowMax = UUID.fromString("ffffffff-ffff-fffe-ffff-ffffffffffff"); + shouldRead = + ManifestEvaluator.forRowFilter(greaterThan("uuid", justBelowMax), SPEC, true).eval(FILE); + assertThat(shouldRead) + .as("Should read: uuid between justBelowMax and the upper bound") + .isTrue(); + + // UUID_MAX_VALUE is the upper bound, so no rows can be > UUID_MAX_VALUE. + // Both RFC and signed comparators agree on this since we're comparing the value to itself. + shouldRead = + ManifestEvaluator.forRowFilter(greaterThan("uuid", UUID_MAX_VALUE), SPEC, true).eval(FILE); + assertThat(shouldRead) + .as("Should not read: uuid range above upper bound (UUID_MAX is not > UUID_MAX)") + .isFalse(); + + // With RFC comparison, aboveMax is above the upper bound so no rows can be > aboveMax. + // With signed comparison, UUID_MAX_VALUE.compareTo(aboveMax) = 1 (upper > lit), + // so rows might match. We try both comparators and return true if either matches. + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = + ManifestEvaluator.forRowFilter(greaterThan("uuid", aboveMax), SPEC, true).eval(FILE); + assertThat(shouldRead) + .as("Should read: signed UUID fallback finds matches with inverted bounds") + .isTrue(); + } + + @Test + public void testUuidGtEq() { + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + boolean shouldRead = + ManifestEvaluator.forRowFilter(greaterThanOrEqual("uuid", belowMin), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: all uuids in range").isTrue(); + + shouldRead = + ManifestEvaluator.forRowFilter(greaterThanOrEqual("uuid", UUID_MIN_VALUE), SPEC, true) + .eval(FILE); + assertThat(shouldRead).as("Should read: all uuids in range").isTrue(); + + shouldRead = + ManifestEvaluator.forRowFilter(greaterThanOrEqual("uuid", UUID_MAX_VALUE), SPEC, true) + .eval(FILE); + assertThat(shouldRead).as("Should read: one possible uuid").isTrue(); + + // With RFC comparison, aboveMax is above the upper bound so no rows can be >= aboveMax. + // With signed comparison, UUID_MAX_VALUE.compareTo(aboveMax) = 1 (upper > lit), + // so rows might match. We try both comparators and return true if either matches. + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = + ManifestEvaluator.forRowFilter(greaterThanOrEqual("uuid", aboveMax), SPEC, true).eval(FILE); + assertThat(shouldRead) + .as("Should read: signed UUID fallback finds matches with inverted bounds") + .isTrue(); + } + + @Test + public void testUuidIn() { + UUID belowMin1 = UUID.fromString("00000000-0000-0000-0000-000000000000"); + UUID belowMin2 = UUID.fromString("00000000-0000-0000-0000-000000000001"); + boolean shouldRead = + ManifestEvaluator.forRowFilter(in("uuid", belowMin1, belowMin2), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should not read: uuids below lower bound").isFalse(); + + shouldRead = + ManifestEvaluator.forRowFilter(in("uuid", belowMin1, UUID_MIN_VALUE), SPEC, true) + .eval(FILE); + assertThat(shouldRead).as("Should read: uuid equal to lower bound").isTrue(); + + UUID middle1 = UUID.fromString("7fffffff-ffff-ffff-0000-000000000000"); + UUID middle2 = UUID.fromString("7fffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = + ManifestEvaluator.forRowFilter(in("uuid", middle1, middle2), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: uuids between lower and upper bounds").isTrue(); + + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = + ManifestEvaluator.forRowFilter(in("uuid", UUID_MAX_VALUE, aboveMax), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should read: uuid equal to upper bound").isTrue(); + + UUID aboveMax2 = UUID.fromString("ffffffff-ffff-ffff-ffff-fffffffffffe"); + shouldRead = + ManifestEvaluator.forRowFilter(in("uuid", aboveMax, aboveMax2), SPEC, true).eval(FILE); + assertThat(shouldRead).as("Should not read: uuids above upper bound").isFalse(); + } + + // Tests for legacy UUID file compatibility (files written with signed UUID comparator). + // These tests simulate manifests where min/max bounds were computed using Java's signed + // comparison. + // + // Java's UUID.compareTo() compares MSB first, then LSB, both as SIGNED longs: + // - UUIDs starting with 0x00-0x7F have positive MSB (as signed long) + // - UUIDs starting with 0x80-0xFF have negative MSB (as signed long) + // + // For a file containing UUIDs 0x00..., 0x40..., 0x80...: + // - Unsigned (RFC) order: 0x00... < 0x40... < 0x80... + // - Signed (Java) order: 0x80... < 0x00... < 0x40... + // + // With signed comparator, the manifest has min=0x80..., max=0x40... (inverted bounds). + + @Test + public void testLegacyUuidManifestIn() { + // 0x20... and 0x30... are between 0x80... and 0x40... in signed order. + // RFC unsigned filters them out (both < 0x80...), but legacy signed includes them. + UUID uuid1 = UUID.fromString("20000000-0000-0000-0000-000000000001"); + UUID uuid2 = UUID.fromString("30000000-0000-0000-0000-000000000001"); + boolean shouldRead = + ManifestEvaluator.forRowFilter(in("uuid", uuid1, uuid2), SPEC, true) + .eval(LEGACY_UUID_MANIFEST); + assertThat(shouldRead) + .as("Should read: legacy fallback should find UUIDs in signed-ordered bounds") + .isTrue(); + } + + @Test + public void testLegacyUuidManifestInAllBelowLowerBound() { + // 0x50... and 0x60... are outside bounds in both orderings: + // - RFC unsigned: both < 0x80... (lower bound) + // - Signed: both > 0x40... (upper bound) + UUID uuid1 = UUID.fromString("50000000-0000-0000-0000-000000000001"); + UUID uuid2 = UUID.fromString("60000000-0000-0000-0000-000000000001"); + boolean shouldRead = + ManifestEvaluator.forRowFilter(in("uuid", uuid1, uuid2), SPEC, true) + .eval(LEGACY_UUID_MANIFEST); + assertThat(shouldRead) + .as("Should not read: UUIDs outside bounds in both RFC and signed order") + .isFalse(); + } + + @Test + public void testLegacyUuidManifestEq() { + // 0x20... is between 0x80... and 0x40... in signed order (within the manifest's actual range). + // RFC unsigned fails (0x20... < 0x80...), but legacy signed succeeds. + UUID queryUuid = UUID.fromString("20000000-0000-0000-0000-000000000001"); + boolean shouldRead = + ManifestEvaluator.forRowFilter(equal("uuid", queryUuid), SPEC, true) + .eval(LEGACY_UUID_MANIFEST); + assertThat(shouldRead) + .as("Should read: legacy fallback should find UUID in signed-ordered bounds") + .isTrue(); + } + + @Test + public void testLegacyUuidManifestLt() { + // In signed order, 0x80... < 0x30..., so the manifest's min satisfies uuid < 0x30... + UUID queryUuid = UUID.fromString("30000000-0000-0000-0000-000000000001"); + boolean shouldRead = + ManifestEvaluator.forRowFilter(lessThan("uuid", queryUuid), SPEC, true) + .eval(LEGACY_UUID_MANIFEST); + assertThat(shouldRead) + .as("Should read: legacy fallback should find UUIDs less than query in signed order") + .isTrue(); + } + + @Test + public void testLegacyUuidManifestGt() { + // In signed order, 0x40... > 0x20..., so the manifest's max satisfies uuid > 0x20... + UUID queryUuid = UUID.fromString("20000000-0000-0000-0000-000000000001"); + boolean shouldRead = + ManifestEvaluator.forRowFilter(greaterThan("uuid", queryUuid), SPEC, true) + .eval(LEGACY_UUID_MANIFEST); + assertThat(shouldRead) + .as("Should read: legacy fallback should find UUIDs greater than query in signed order") + .isTrue(); + } } diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java index 5f0ca2659fbf..26ae8ba08ec6 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java @@ -42,6 +42,7 @@ import static org.assertj.core.api.Assertions.assertThatThrownBy; import java.util.List; +import java.util.UUID; import org.apache.iceberg.DataFile; import org.apache.iceberg.Schema; import org.apache.iceberg.TestHelpers.Row; @@ -71,7 +72,8 @@ public class TestInclusiveMetricsEvaluator { optional(11, "all_nans_v1_stats", Types.FloatType.get()), optional(12, "nan_and_null_only", Types.DoubleType.get()), optional(13, "no_nan_stats", Types.DoubleType.get()), - optional(14, "some_empty", Types.StringType.get())); + optional(14, "some_empty", Types.StringType.get()), + optional(15, "uuid", Types.UUIDType.get())); private static final Schema NESTED_SCHEMA = new Schema( @@ -91,6 +93,38 @@ public class TestInclusiveMetricsEvaluator { private static final int INT_MIN_VALUE = 30; private static final int INT_MAX_VALUE = 79; + // UUIDs that demonstrate the difference between Java's natural order and byte-order comparison + // UUID_MIN has all zeros in MSB, all ones in LSB: 00000000-0000-0000-ffff-ffffffffffff + // UUID_MAX has all ones in MSB, all zeros in LSB: ffffffff-ffff-ffff-0000-000000000000 + // With byte-order comparison (correct): UUID_MIN < UUID_MAX (0x00... < 0xFF...) + // With Java's natural order (incorrect): UUID_MIN > UUID_MAX (MSB 0 > MSB -1 as signed long) + private static final UUID UUID_MIN_VALUE = + UUID.fromString("00000000-0000-0000-ffff-ffffffffffff"); + private static final UUID UUID_MAX_VALUE = + UUID.fromString("ffffffff-ffff-ffff-0000-000000000000"); + + // Legacy file with "inverted" bounds (min > max in unsigned order) + // Used for testing backward compatibility with files written using signed UUID comparator + // File contains UUIDs from 0x80... to 0x40... (in signed order) + // This represents actual values: 0x80..., 0x00..., 0x20..., 0x40... + private static final UUID LEGACY_SIGNED_MIN = + UUID.fromString("80000000-0000-0000-0000-000000000001"); // a small value in signed order + private static final UUID LEGACY_SIGNED_MAX = + UUID.fromString("40000000-0000-0000-0000-000000000001"); // a large value in signed order + + private static final DataFile LEGACY_UUID_FILE = + new TestDataFile( + "legacy_uuid_file.avro", + Row.of(), + 50, + ImmutableMap.of(15, 50L), + ImmutableMap.of(15, 0L), + null, + // lower bound: 0x80... (smallest in signed comparison) + ImmutableMap.of(15, toByteBuffer(Types.UUIDType.get(), LEGACY_SIGNED_MIN)), + // upper bound: 0x40... (largest in signed comparison) + ImmutableMap.of(15, toByteBuffer(Types.UUIDType.get(), LEGACY_SIGNED_MAX))); + private static final DataFile FILE = new TestDataFile( "file.avro", @@ -109,6 +143,7 @@ public class TestInclusiveMetricsEvaluator { .put(12, 50L) .put(13, 50L) .put(14, 50L) + .put(15, 50L) .buildOrThrow(), // null value counts ImmutableMap.builder() @@ -119,6 +154,7 @@ public class TestInclusiveMetricsEvaluator { .put(11, 0L) .put(12, 1L) .put(14, 0L) + .put(15, 0L) .buildOrThrow(), // nan value counts ImmutableMap.of( @@ -130,13 +166,15 @@ public class TestInclusiveMetricsEvaluator { 1, toByteBuffer(IntegerType.get(), INT_MIN_VALUE), 11, toByteBuffer(Types.FloatType.get(), Float.NaN), 12, toByteBuffer(Types.DoubleType.get(), Double.NaN), - 14, toByteBuffer(Types.StringType.get(), "")), + 14, toByteBuffer(Types.StringType.get(), ""), + 15, toByteBuffer(Types.UUIDType.get(), UUID_MIN_VALUE)), // upper bounds ImmutableMap.of( 1, toByteBuffer(IntegerType.get(), INT_MAX_VALUE), 11, toByteBuffer(Types.FloatType.get(), Float.NaN), 12, toByteBuffer(Types.DoubleType.get(), Double.NaN), - 14, toByteBuffer(Types.StringType.get(), "房东整租霍营小区二层两居室"))); + 14, toByteBuffer(Types.StringType.get(), "房东整租霍营小区二层两居室"), + 15, toByteBuffer(Types.UUIDType.get(), UUID_MAX_VALUE))); private static final DataFile FILE_2 = new TestDataFile( @@ -1138,4 +1176,287 @@ public void testNotInWithSingleValue() { .as("Should read: file has NaN values which match NOT IN predicate") .isTrue(); } + + @Test + public void testUuidEq() { + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, equal("uuid", belowMin)).eval(FILE); + assertThat(shouldRead).as("Should not read: uuid below lower bound").isFalse(); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, equal("uuid", UUID_MIN_VALUE)).eval(FILE); + assertThat(shouldRead).as("Should read: uuid equal to lower bound").isTrue(); + + UUID between = UUID.fromString("7fffffff-ffff-ffff-7fff-ffffffffffff"); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, equal("uuid", between)).eval(FILE); + assertThat(shouldRead).as("Should read: uuid between lower and upper bounds").isTrue(); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, equal("uuid", UUID_MAX_VALUE)).eval(FILE); + assertThat(shouldRead).as("Should read: uuid equal to upper bound").isTrue(); + + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, equal("uuid", aboveMax)).eval(FILE); + assertThat(shouldRead).as("Should not read: uuid above upper bound").isFalse(); + } + + @Test + public void testUuidLt() { + // With RFC comparison, belowMin is below the lower bound so no rows can be < belowMin. + // With signed comparison, UUID_MIN_VALUE.compareTo(belowMin) = -1 (lower < lit), + // so rows might match. We try both comparators and return true if either matches. + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, lessThan("uuid", belowMin)).eval(FILE); + assertThat(shouldRead) + .as("Should read: signed UUID fallback finds matches with inverted bounds") + .isTrue(); + + // UUID_MIN_VALUE is the lower bound, so no rows can be < UUID_MIN_VALUE. + // Both RFC and signed comparators agree on this since we're comparing the value to itself. + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThan("uuid", UUID_MIN_VALUE)).eval(FILE); + assertThat(shouldRead) + .as("Should not read: uuid range below lower bound (UUID_MIN is not < UUID_MIN)") + .isFalse(); + + UUID justAboveMin = UUID.fromString("00000000-0000-0001-0000-000000000000"); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThan("uuid", justAboveMin)).eval(FILE); + assertThat(shouldRead).as("Should read: one possible uuid").isTrue(); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThan("uuid", UUID_MAX_VALUE)).eval(FILE); + assertThat(shouldRead).as("Should read: uuid between lower and upper bounds").isTrue(); + + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThan("uuid", aboveMax)).eval(FILE); + assertThat(shouldRead).as("Should read: all uuids in range").isTrue(); + } + + @Test + public void testUuidLtEq() { + // With RFC comparison, belowMin is below the lower bound so no rows can be <= belowMin. + // However, we also try signed comparison for backward compatibility, and with signed comparison + // the bounds are inverted, so rows might match. + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, lessThanOrEqual("uuid", belowMin)).eval(FILE); + assertThat(shouldRead) + .as("Should read: signed UUID fallback may find matches with inverted bounds") + .isTrue(); + + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, lessThanOrEqual("uuid", UUID_MIN_VALUE)).eval(FILE); + assertThat(shouldRead).as("Should read: one possible uuid").isTrue(); + + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, lessThanOrEqual("uuid", UUID_MAX_VALUE)).eval(FILE); + assertThat(shouldRead).as("Should read: all uuids in range").isTrue(); + + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, lessThanOrEqual("uuid", aboveMax)).eval(FILE); + assertThat(shouldRead).as("Should read: all uuids in range").isTrue(); + } + + @Test + public void testUuidGt() { + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, greaterThan("uuid", belowMin)).eval(FILE); + assertThat(shouldRead).as("Should read: all uuids in range").isTrue(); + + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, greaterThan("uuid", UUID_MIN_VALUE)).eval(FILE); + assertThat(shouldRead).as("Should read: uuid between lower and upper bounds").isTrue(); + + UUID justBelowMax = UUID.fromString("ffffffff-ffff-fffe-ffff-ffffffffffff"); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, greaterThan("uuid", justBelowMax)).eval(FILE); + assertThat(shouldRead).as("Should read: one possible uuid").isTrue(); + + // UUID_MAX_VALUE is the upper bound, so no rows can be > UUID_MAX_VALUE. + // Both RFC and signed comparators agree on this since we're comparing the value to itself. + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, greaterThan("uuid", UUID_MAX_VALUE)).eval(FILE); + assertThat(shouldRead) + .as("Should not read: uuid range above upper bound (UUID_MAX is not > UUID_MAX)") + .isFalse(); + + // With RFC comparison, aboveMax is above the upper bound so no rows can be > aboveMax. + // With signed comparison, UUID_MAX_VALUE.compareTo(aboveMax) = 1 (upper > lit), + // so rows might match. We try both comparators and return true if either matches. + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, greaterThan("uuid", aboveMax)).eval(FILE); + assertThat(shouldRead) + .as("Should read: signed UUID fallback finds matches with inverted bounds") + .isTrue(); + } + + @Test + public void testUuidGtEq() { + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, greaterThanOrEqual("uuid", belowMin)).eval(FILE); + assertThat(shouldRead).as("Should read: all uuids in range").isTrue(); + + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, greaterThanOrEqual("uuid", UUID_MIN_VALUE)) + .eval(FILE); + assertThat(shouldRead).as("Should read: all uuids in range").isTrue(); + + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, greaterThanOrEqual("uuid", UUID_MAX_VALUE)) + .eval(FILE); + assertThat(shouldRead).as("Should read: one possible uuid").isTrue(); + + // With RFC comparison, aboveMax is above the upper bound so no rows can be >= aboveMax. + // With signed comparison, UUID_MAX_VALUE.compareTo(aboveMax) = 1 (upper > lit), + // so rows might match. We try both comparators and return true if either matches. + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, greaterThanOrEqual("uuid", aboveMax)).eval(FILE); + assertThat(shouldRead) + .as("Should read: signed UUID fallback finds matches with inverted bounds") + .isTrue(); + } + + @Test + public void testUuidIn() { + UUID belowMin1 = UUID.fromString("00000000-0000-0000-0000-000000000000"); + UUID belowMin2 = UUID.fromString("00000000-0000-0000-0000-000000000001"); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, in("uuid", belowMin1, belowMin2)).eval(FILE); + assertThat(shouldRead).as("Should not read: uuids below lower bound").isFalse(); + + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, in("uuid", belowMin1, UUID_MIN_VALUE)).eval(FILE); + assertThat(shouldRead).as("Should read: uuid equal to lower bound").isTrue(); + + UUID middle1 = UUID.fromString("7fffffff-ffff-ffff-0000-000000000000"); + UUID middle2 = UUID.fromString("7fffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, in("uuid", middle1, middle2)).eval(FILE); + assertThat(shouldRead).as("Should read: uuids between lower and upper bounds").isTrue(); + + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, in("uuid", UUID_MAX_VALUE, aboveMax)).eval(FILE); + assertThat(shouldRead).as("Should read: uuid equal to upper bound").isTrue(); + + UUID aboveMax2 = UUID.fromString("ffffffff-ffff-ffff-ffff-fffffffffffe"); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, in("uuid", aboveMax, aboveMax2)).eval(FILE); + assertThat(shouldRead).as("Should not read: uuids above upper bound").isFalse(); + } + + @Test + public void testUuidNotEq() { + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notEqual("uuid", belowMin)).eval(FILE); + assertThat(shouldRead).as("Should read: notEqual always reads").isTrue(); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notEqual("uuid", UUID_MIN_VALUE)).eval(FILE); + assertThat(shouldRead).as("Should read: notEqual always reads").isTrue(); + + UUID middle = UUID.fromString("7fffffff-ffff-ffff-7fff-ffffffffffff"); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notEqual("uuid", middle)).eval(FILE); + assertThat(shouldRead).as("Should read: notEqual always reads").isTrue(); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notEqual("uuid", UUID_MAX_VALUE)).eval(FILE); + assertThat(shouldRead).as("Should read: notEqual always reads").isTrue(); + + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notEqual("uuid", aboveMax)).eval(FILE); + assertThat(shouldRead).as("Should read: notEqual always reads").isTrue(); + } + + @Test + public void testUuidNotIn() { + UUID belowMin1 = UUID.fromString("00000000-0000-0000-0000-000000000000"); + UUID belowMin2 = UUID.fromString("00000000-0000-0000-0000-000000000001"); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notIn("uuid", belowMin1, belowMin2)).eval(FILE); + assertThat(shouldRead).as("Should read: notIn always reads").isTrue(); + + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notIn("uuid", UUID_MIN_VALUE, UUID_MAX_VALUE)) + .eval(FILE); + assertThat(shouldRead).as("Should read: notIn always reads").isTrue(); + + UUID middle1 = UUID.fromString("7fffffff-ffff-ffff-0000-000000000000"); + UUID middle2 = UUID.fromString("7fffffff-ffff-ffff-ffff-ffffffffffff"); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notIn("uuid", middle1, middle2)).eval(FILE); + assertThat(shouldRead).as("Should read: notIn always reads").isTrue(); + } + + // Tests for legacy UUID file compatibility (files written with signed UUID comparator). + // These tests simulate files where min/max bounds were computed using Java's signed comparison. + // + // Java's UUID.compareTo() compares MSB first, then LSB, both as SIGNED longs: + // - UUIDs starting with 0x00-0x7F have positive MSB (as signed long) + // - UUIDs starting with 0x80-0xFF have negative MSB (as signed long) + // + // For a file containing UUIDs 0x00..., 0x40..., 0x80...: + // - Unsigned (RFC) order: 0x00... < 0x40... < 0x80... + // - Signed (Java) order: 0x80... < 0x00... < 0x40... + // + // With signed comparator, the file has min=0x80..., max=0x40... (inverted bounds). + // + // Querying for 0x20... (which exists in the file): + // - RFC unsigned: 0x20... < 0x80... (lower bound), so incorrectly skipped + // - Legacy signed: 0x80... < 0x20... < 0x40..., so correctly included + + @Test + public void testLegacyUuidFileEq() { + // 0x20... is between 0x80... and 0x40... in signed order (within the file's actual range). + // RFC unsigned fails (0x20... < 0x80...), but legacy signed succeeds. + UUID queryUuid = UUID.fromString("20000000-0000-0000-0000-000000000001"); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, equal("uuid", queryUuid)).eval(LEGACY_UUID_FILE); + assertThat(shouldRead) + .as("Should read: legacy fallback should find UUID in signed-ordered bounds") + .isTrue(); + } + + @Test + public void testLegacyUuidFileLt() { + // In signed order, 0x80... < 0x30..., so the file's min satisfies uuid < 0x30... + UUID queryUuid = UUID.fromString("30000000-0000-0000-0000-000000000001"); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, lessThan("uuid", queryUuid)).eval(LEGACY_UUID_FILE); + assertThat(shouldRead) + .as("Should read: legacy fallback should find UUIDs less than query in signed order") + .isTrue(); + } + + @Test + public void testLegacyUuidFileGt() { + // In signed order, 0x40... > 0x20..., so the file's max satisfies uuid > 0x20... + UUID queryUuid = UUID.fromString("20000000-0000-0000-0000-000000000001"); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, greaterThan("uuid", queryUuid)) + .eval(LEGACY_UUID_FILE); + assertThat(shouldRead) + .as("Should read: legacy fallback should find UUIDs greater than query in signed order") + .isTrue(); + } + + @Test + public void testLegacyUuidFileIn() { + // 0x20... and 0x30... are between 0x80... and 0x40... in signed order. + // RFC unsigned filters them out (both < 0x80...), but legacy signed includes them. + UUID uuid1 = UUID.fromString("20000000-0000-0000-0000-000000000001"); + UUID uuid2 = UUID.fromString("30000000-0000-0000-0000-000000000001"); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, in("uuid", uuid1, uuid2)).eval(LEGACY_UUID_FILE); + assertThat(shouldRead) + .as("Should read: legacy fallback should find UUIDs in signed-ordered bounds") + .isTrue(); + } + + @Test + public void testNonUuidPredicateNoLegacyFallback() { + // Non-UUID predicates should not trigger legacy fallback (no performance impact). + boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, equal("id", 50)).eval(FILE); + assertThat(shouldRead).as("Should read: id 50 is in range [30, 79]").isTrue(); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, equal("id", 100)).eval(FILE); + assertThat(shouldRead).as("Should not read: id 100 is above range [30, 79]").isFalse(); + } } diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java index f34cd730df77..0c243b2f17a5 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java @@ -39,6 +39,7 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; +import java.util.UUID; import org.apache.iceberg.DataFile; import org.apache.iceberg.Schema; import org.apache.iceberg.TestHelpers.Row; @@ -73,7 +74,8 @@ public class TestStrictMetricsEvaluator { Types.StructType.of( Types.NestedField.optional(16, "nested_col_no_stats", Types.IntegerType.get()), Types.NestedField.optional( - 17, "nested_col_with_stats", Types.IntegerType.get())))); + 17, "nested_col_with_stats", Types.IntegerType.get()))), + optional(18, "uuid", Types.UUIDType.get())); private static final int INT_MIN_VALUE = 30; private static final int INT_MAX_VALUE = 79; @@ -172,6 +174,53 @@ public class TestStrictMetricsEvaluator { // upper bounds ImmutableMap.of(5, toByteBuffer(StringType.get(), "bbb"))); + // UUIDs for testing dual-comparator behavior with high-bit values + // These UUIDs span the signed/unsigned comparison boundary + private static final UUID UUID_MIN = UUID.fromString("00000000-0000-0000-0000-000000000001"); + private static final UUID UUID_MAX = UUID.fromString("80000000-0000-0000-0000-000000000001"); + + private static final DataFile UUID_FILE = + new TestDataFile( + "uuid_file.avro", + Row.of(), + 50, + ImmutableMap.of(18, 50L), + ImmutableMap.of(18, 0L), + null, + ImmutableMap.of(18, toByteBuffer(Types.UUIDType.get(), UUID_MIN)), + ImmutableMap.of(18, toByteBuffer(Types.UUIDType.get(), UUID_MAX))); + + // File with single UUID value (lower == upper) + private static final UUID SINGLE_UUID = UUID.fromString("40000000-0000-0000-0000-000000000001"); + private static final DataFile SINGLE_UUID_FILE = + new TestDataFile( + "single_uuid_file.avro", + Row.of(), + 50, + ImmutableMap.of(18, 50L), + ImmutableMap.of(18, 0L), + null, + ImmutableMap.of(18, toByteBuffer(Types.UUIDType.get(), SINGLE_UUID)), + ImmutableMap.of(18, toByteBuffer(Types.UUIDType.get(), SINGLE_UUID))); + + // File with inverted UUID bounds (as would be written by legacy signed comparator) + // In RFC unsigned order: 0x40... < 0x80..., so lower > upper when interpreted with RFC + private static final UUID LEGACY_UUID_LOWER = + UUID.fromString("80000000-0000-0000-0000-000000000001"); + private static final UUID LEGACY_UUID_UPPER = + UUID.fromString("40000000-0000-0000-0000-000000000001"); + + private static final DataFile LEGACY_UUID_FILE = + new TestDataFile( + "legacy_uuid_file.avro", + Row.of(), + 50, + ImmutableMap.of(18, 50L), + ImmutableMap.of(18, 0L), + null, + ImmutableMap.of(18, toByteBuffer(Types.UUIDType.get(), LEGACY_UUID_LOWER)), + ImmutableMap.of(18, toByteBuffer(Types.UUIDType.get(), LEGACY_UUID_UPPER))); + @Test public void testAllNulls() { boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, notNull("all_nulls")).eval(FILE); @@ -684,4 +733,146 @@ SCHEMA, lessThanOrEqual("struct.nested_col_with_stats", INT_MAX_VALUE)) new StrictMetricsEvaluator(SCHEMA, notNull("struct.nested_col_with_stats")).eval(FILE); assertThat(shouldRead).as("notNull nested column should not match").isFalse(); } + + // Tests for UUID with StrictMetricsEvaluator using RFC-compliant comparison only + // UUID_FILE has bounds [UUID_MIN=0x00...01, UUID_MAX=0x80...01] + + @Test + public void testStrictUuidGt() { + // Query: uuid > belowMin (all UUIDs in file should be > this) + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + boolean allMatch = + new StrictMetricsEvaluator(SCHEMA, greaterThan("uuid", belowMin)).eval(UUID_FILE); + assertThat(allMatch).as("All UUIDs should be greater than belowMin").isTrue(); + + // Query: uuid > UUID_MIN (lower bound) - not all values are > lower bound + allMatch = new StrictMetricsEvaluator(SCHEMA, greaterThan("uuid", UUID_MIN)).eval(UUID_FILE); + assertThat(allMatch).as("Not all UUIDs are greater than lower bound").isFalse(); + + // Query: uuid > middle value - not all values are > middle + UUID middle = UUID.fromString("40000000-0000-0000-0000-000000000001"); + allMatch = new StrictMetricsEvaluator(SCHEMA, greaterThan("uuid", middle)).eval(UUID_FILE); + assertThat(allMatch).as("Not all UUIDs are greater than middle").isFalse(); + + // Query: uuid > UUID_MAX (upper bound) - no values are > upper bound + allMatch = new StrictMetricsEvaluator(SCHEMA, greaterThan("uuid", UUID_MAX)).eval(UUID_FILE); + assertThat(allMatch).as("No UUIDs are greater than upper bound").isFalse(); + + // Query: uuid > aboveMax - no values are > aboveMax + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + allMatch = new StrictMetricsEvaluator(SCHEMA, greaterThan("uuid", aboveMax)).eval(UUID_FILE); + assertThat(allMatch).as("No UUIDs are greater than aboveMax").isFalse(); + } + + @Test + public void testStrictUuidLt() { + // Query: uuid < belowMin - no values are < belowMin + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + boolean allMatch = + new StrictMetricsEvaluator(SCHEMA, lessThan("uuid", belowMin)).eval(UUID_FILE); + assertThat(allMatch).as("No UUIDs are less than belowMin").isFalse(); + + // Query: uuid < UUID_MIN (lower bound) - no values are < lower bound + allMatch = new StrictMetricsEvaluator(SCHEMA, lessThan("uuid", UUID_MIN)).eval(UUID_FILE); + assertThat(allMatch).as("No UUIDs are less than lower bound").isFalse(); + + // Query: uuid < middle value - not all values are < middle + UUID middle = UUID.fromString("40000000-0000-0000-0000-000000000001"); + allMatch = new StrictMetricsEvaluator(SCHEMA, lessThan("uuid", middle)).eval(UUID_FILE); + assertThat(allMatch).as("Not all UUIDs are less than middle").isFalse(); + + // Query: uuid < UUID_MAX (upper bound) - not all values are < upper bound + allMatch = new StrictMetricsEvaluator(SCHEMA, lessThan("uuid", UUID_MAX)).eval(UUID_FILE); + assertThat(allMatch).as("Not all UUIDs are less than upper bound").isFalse(); + + // Query: uuid < aboveMax (all UUIDs in file should be < this) + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + allMatch = new StrictMetricsEvaluator(SCHEMA, lessThan("uuid", aboveMax)).eval(UUID_FILE); + assertThat(allMatch).as("All UUIDs should be less than aboveMax").isTrue(); + } + + @Test + public void testStrictUuidEqNeverMatchesRange() { + // Strict eq should never match when there's a range of values + UUID middle = UUID.fromString("40000000-0000-0000-0000-000000000001"); + boolean allMatch = new StrictMetricsEvaluator(SCHEMA, equal("uuid", middle)).eval(UUID_FILE); + assertThat(allMatch).as("Strict eq should not match range").isFalse(); + } + + @Test + public void testStrictUuidInNeverMatchesRange() { + // Strict IN should never match when there's a range of values (lower != upper) + UUID middle1 = UUID.fromString("40000000-0000-0000-0000-000000000001"); + UUID middle2 = UUID.fromString("50000000-0000-0000-0000-000000000001"); + boolean allMatch = + new StrictMetricsEvaluator(SCHEMA, in("uuid", middle1, middle2)).eval(UUID_FILE); + assertThat(allMatch).as("Strict IN should not match range").isFalse(); + } + + @Test + public void testStrictUuidInMatchesSingleValue() { + // Strict IN should match when lower == upper and the value is in the set + boolean allMatch = + new StrictMetricsEvaluator(SCHEMA, in("uuid", SINGLE_UUID)).eval(SINGLE_UUID_FILE); + assertThat(allMatch).as("Strict IN should match single value in set").isTrue(); + } + + @Test + public void testStrictUuidInDoesNotMatchWhenValueNotInSet() { + // Strict IN should not match when lower == upper but the value is not in the set + UUID otherUuid = UUID.fromString("50000000-0000-0000-0000-000000000001"); + boolean allMatch = + new StrictMetricsEvaluator(SCHEMA, in("uuid", otherUuid)).eval(SINGLE_UUID_FILE); + assertThat(allMatch).as("Strict IN should not match when value not in set").isFalse(); + } + + @Test + public void testStrictUuidNotInMatchesWhenAllValuesOutsideBounds() { + // Strict NOT IN should match when all values in the set are outside the file's bounds + UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000"); + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + boolean allMatch = + new StrictMetricsEvaluator(SCHEMA, notIn("uuid", belowMin, aboveMax)).eval(UUID_FILE); + // All values in the set are outside [UUID_MIN, UUID_MAX], so all rows match NOT IN + assertThat(allMatch).as("Strict NOT IN should match when all values outside bounds").isTrue(); + } + + @Test + public void testStrictUuidNotInDoesNotMatchWhenValueInBounds() { + // Strict NOT IN should not match when a value in the set is within bounds + UUID middle = UUID.fromString("40000000-0000-0000-0000-000000000001"); + boolean allMatch = new StrictMetricsEvaluator(SCHEMA, notIn("uuid", middle)).eval(UUID_FILE); + // middle is within [UUID_MIN, UUID_MAX], so some rows might match the value + assertThat(allMatch).as("Strict NOT IN should not match when value in bounds").isFalse(); + } + + // Tests for file with inverted UUID bounds (as would be written by legacy signed comparator) + + @Test + public void testStrictUuidInWithLegacyInvertedBounds() { + // With inverted bounds [0x80..., 0x40...] where lower > upper in RFC order, + // strict IN should never match since lower != upper + UUID uuid1 = UUID.fromString("20000000-0000-0000-0000-000000000001"); + UUID uuid2 = UUID.fromString("30000000-0000-0000-0000-000000000001"); + boolean allMatch = + new StrictMetricsEvaluator(SCHEMA, in("uuid", uuid1, uuid2)).eval(LEGACY_UUID_FILE); + assertThat(allMatch).as("Strict IN should not match when lower != upper").isFalse(); + } + + @Test + public void testStrictUuidNotInWithLegacyInvertedBounds() { + // Query: NOT IN (0x50..., 0x60...) + // With inverted bounds [0x80..., 0x40...], in RFC order: 0x40... < 0x50... < 0x60... < 0x80... + // The values 0x50... and 0x60... are between upper (0x40...) and lower (0x80...) in RFC order + // Since bounds are inverted, the evaluator sees lower (0x80...) > values, so they appear + // below the lower bound, and strict NOT IN returns true + UUID uuid1 = UUID.fromString("50000000-0000-0000-0000-000000000001"); + UUID uuid2 = UUID.fromString("60000000-0000-0000-0000-000000000001"); + boolean allMatch = + new StrictMetricsEvaluator(SCHEMA, notIn("uuid", uuid1, uuid2)).eval(LEGACY_UUID_FILE); + // With RFC-only comparison, values appear below lower bound (0x80...), so all rows match NOT IN + assertThat(allMatch) + .as("Strict NOT IN should match when all values below lower bound in RFC order") + .isTrue(); + } } diff --git a/api/src/test/java/org/apache/iceberg/types/TestComparators.java b/api/src/test/java/org/apache/iceberg/types/TestComparators.java index 691e3f04a074..7127050d60fc 100644 --- a/api/src/test/java/org/apache/iceberg/types/TestComparators.java +++ b/api/src/test/java/org/apache/iceberg/types/TestComparators.java @@ -100,6 +100,76 @@ public void testUuid() { Comparators.forType(Types.UUIDType.get()), UUID.fromString("81873e7d-1374-4493-8e1d-9095eff7046c"), UUID.fromString("fd02441d-1423-4a3f-8785-c7dd5647e26b")); + assertComparesCorrectly( + Comparators.forType(Types.UUIDType.get()), + UUID.fromString("00000000-0000-0000-0000-000000000000"), + UUID.fromString("60000000-0000-0000-0000-000000000000")); + assertComparesCorrectly( + Comparators.forType(Types.UUIDType.get()), + UUID.fromString("60000000-0000-0000-0000-000000000000"), + UUID.fromString("70000000-0000-0000-0000-000000000000")); + // The following assertions fail prior to the introduction of unsigned UUID comparator. + assertComparesCorrectly( + Comparators.forType(Types.UUIDType.get()), + UUID.fromString("70000000-0000-0000-0000-000000000000"), + UUID.fromString("80000000-0000-0000-0000-000000000000")); + assertComparesCorrectly( + Comparators.forType(Types.UUIDType.get()), + UUID.fromString("80000000-0000-0000-0000-000000000000"), + UUID.fromString("90000000-0000-0000-0000-000000000000")); + assertComparesCorrectly( + Comparators.forType(Types.UUIDType.get()), + UUID.fromString("90000000-0000-0000-0000-000000000000"), + UUID.fromString("a0000000-0000-0000-0000-000000000000")); + assertComparesCorrectly( + Comparators.forType(Types.UUIDType.get()), + UUID.fromString("a0000000-0000-0000-0000-000000000000"), + UUID.fromString("f0000000-0000-0000-0000-000000000000")); + assertComparesCorrectly( + Comparators.forType(Types.UUIDType.get()), + UUID.fromString("ffffffff-ffff-ffff-ffff-fffffffffffe"), + UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff")); + } + + @Test + public void testSignedUuid() { + // Test the signed UUID comparator used for backward compatibility with legacy files. + // In signed comparison, UUIDs with high bit set (0x80...) are treated as negative, + // so they compare less than UUIDs without high bit set (0x00-0x7F...). + Comparator signedCmp = Comparators.signedUUIDs(); + + // Within the "positive" range (0x00-0x7F), ordering is the same as unsigned + assertComparesCorrectly( + signedCmp, + UUID.fromString("00000000-0000-0000-0000-000000000000"), + UUID.fromString("40000000-0000-0000-0000-000000000000")); + assertComparesCorrectly( + signedCmp, + UUID.fromString("40000000-0000-0000-0000-000000000000"), + UUID.fromString("70000000-0000-0000-0000-000000000000")); + + // Within the "negative" range (0x80-0xFF), ordering is the same as unsigned + assertComparesCorrectly( + signedCmp, + UUID.fromString("80000000-0000-0000-0000-000000000000"), + UUID.fromString("90000000-0000-0000-0000-000000000000")); + assertComparesCorrectly( + signedCmp, + UUID.fromString("c0000000-0000-0000-0000-000000000000"), + UUID.fromString("f0000000-0000-0000-0000-000000000000")); + + // In signed comparison, 0x80... < 0x70... because the high bit is treated as negative. + // In unsigned comparison, 0x70... < 0x80... because bytes are compared as unsigned values. + UUID highBit = UUID.fromString("80000000-0000-0000-0000-000000000000"); + UUID lowBit = UUID.fromString("70000000-0000-0000-0000-000000000000"); + assertThat(signedCmp.compare(highBit, lowBit)) + .as("In signed comparison, 0x80... should be less than 0x70...") + .isLessThan(0); + + Comparator unsignedCmp = Comparators.forType(Types.UUIDType.get()); + assertThat(unsignedCmp.compare(highBit, lowBit)) + .as("In unsigned comparison, 0x80... should be greater than 0x70...") + .isGreaterThan(0); } @Test diff --git a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java index 3cb46b309d82..4c0b6b4d3844 100644 --- a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java +++ b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java @@ -41,10 +41,13 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.assertj.core.api.Assumptions.assumeThat; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; import java.io.File; import java.io.IOException; import java.io.UncheckedIOException; +import java.nio.ByteBuffer; import java.util.Arrays; import java.util.List; import java.util.UUID; @@ -69,21 +72,27 @@ import org.apache.iceberg.orc.ORC; import org.apache.iceberg.parquet.Parquet; import org.apache.iceberg.parquet.ParquetMetricsRowGroupFilter; +import org.apache.iceberg.parquet.ParquetSchemaUtil; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.types.Types; import org.apache.iceberg.types.Types.DoubleType; import org.apache.iceberg.types.Types.FloatType; import org.apache.iceberg.types.Types.IntegerType; import org.apache.iceberg.types.Types.StringType; +import org.apache.iceberg.util.UUIDUtil; import org.apache.iceberg.variants.ShreddedObject; import org.apache.iceberg.variants.Variant; import org.apache.iceberg.variants.VariantMetadata; import org.apache.iceberg.variants.Variants; import org.apache.orc.OrcFile; import org.apache.orc.Reader; +import org.apache.parquet.column.statistics.Statistics; import org.apache.parquet.hadoop.ParquetFileReader; import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +import org.apache.parquet.hadoop.metadata.ColumnPath; import org.apache.parquet.io.DelegatingSeekableInputStream; +import org.apache.parquet.io.api.Binary; import org.apache.parquet.schema.MessageType; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.TestTemplate; @@ -162,6 +171,17 @@ public static List parameters() { private static final UUID UUID_WITH_ZEROS = UUID.fromString("00000000-0000-0000-0000-000000000000"); + // UUIDs for testing dual-comparator behavior with high-bit values + // These UUIDs span the signed/unsigned comparison boundary + private static final UUID UUID_LOW = UUID.fromString("00000000-0000-0000-0000-000000000001"); + private static final UUID UUID_MID = UUID.fromString("40000000-0000-0000-0000-000000000001"); + private static final UUID UUID_HIGH = UUID.fromString("80000000-0000-0000-0000-000000000001"); + private static final UUID UUID_HIGHER = UUID.fromString("c0000000-0000-0000-0000-000000000001"); + + private static final Schema UUID_SCHEMA = + new Schema( + required(1, "id", IntegerType.get()), optional(2, "uuid_col", Types.UUIDType.get())); + private File orcFile = null; private MessageType parquetSchema = null; private BlockMetaData rowGroupMetadata = null; @@ -1138,6 +1158,252 @@ public void testUUID() { .isTrue(); } + /** + * Tests UUID filtering with values that span the signed/unsigned comparison boundary. In RFC 9562 + * unsigned comparison: UUID_LOW < UUID_MID < UUID_HIGH < UUID_HIGHER. In legacy signed + * comparison: UUID_HIGH < UUID_HIGHER < UUID_LOW < UUID_MID (high bit treated as negative). + * + *

The dual-comparator approach ensures we don't incorrectly skip files that might contain + * matching rows when reading legacy files with inverted UUID bounds. + */ + @TestTemplate + public void testUUIDWithHighBitValues() throws IOException { + assumeThat(format).as("Only valid for Parquet").isEqualTo(FileFormat.PARQUET); + + // Create a file with UUIDs spanning the signed/unsigned boundary + List records = Lists.newArrayList(); + UUID[] uuids = {UUID_LOW, UUID_MID, UUID_HIGH, UUID_HIGHER}; + for (int i = 0; i < uuids.length; i++) { + GenericRecord record = GenericRecord.create(UUID_SCHEMA); + record.setField("id", i); + record.setField("uuid_col", uuids[i]); + records.add(record); + } + + File parquetFile = writeParquetFile("uuid-high-bit-test", UUID_SCHEMA, records); + InputFile inFile = Files.localInput(parquetFile); + try (ParquetFileReader reader = ParquetFileReader.open(parquetInputFile(inFile))) { + BlockMetaData blockMetaData = reader.getRowGroups().get(0); + MessageType fileSchema = reader.getFileMetaData().getSchema(); + + // Test equality - should find exact matches + assertThat(shouldReadUUID(equal("uuid_col", UUID_HIGH), fileSchema, blockMetaData)) + .as("Should read: file contains UUID_HIGH") + .isTrue(); + + // Test less than with high-bit UUID + // Query: uuid < UUID_HIGH (should match UUID_LOW and UUID_MID in RFC order) + assertThat(shouldReadUUID(lessThan("uuid_col", UUID_HIGH), fileSchema, blockMetaData)) + .as("Should read: file contains values less than UUID_HIGH") + .isTrue(); + + // Test greater than with low UUID + // Query: uuid > UUID_LOW (should match UUID_MID, UUID_HIGH, UUID_HIGHER in RFC order) + assertThat(shouldReadUUID(greaterThan("uuid_col", UUID_LOW), fileSchema, blockMetaData)) + .as("Should read: file contains values greater than UUID_LOW") + .isTrue(); + + // Test greater than with highest UUID - should not match any + UUID aboveAll = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + assertThat(shouldReadUUID(greaterThan("uuid_col", aboveAll), fileSchema, blockMetaData)) + .as("Should skip: no values greater than max UUID") + .isFalse(); + + // Test less than with lowest UUID - should not match any + UUID belowAll = UUID.fromString("00000000-0000-0000-0000-000000000000"); + assertThat(shouldReadUUID(lessThan("uuid_col", belowAll), fileSchema, blockMetaData)) + .as("Should skip: no values less than min UUID") + .isFalse(); + + // Test IN with high-bit UUIDs + assertThat(shouldReadUUID(in("uuid_col", UUID_HIGH, UUID_HIGHER), fileSchema, blockMetaData)) + .as("Should read: file contains one of the IN values") + .isTrue(); + + // Test IN with UUID outside file bounds (above max) + // The file's max is UUID_HIGHER (0xc0...), so 0xff... is outside bounds + UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"); + assertThat(shouldReadUUID(in("uuid_col", aboveMax), fileSchema, blockMetaData)) + .as("Should skip: value is above file's max bound") + .isFalse(); + + // Test NOT IN - should read when file contains values not in the set + assertThat(shouldReadUUID(notIn("uuid_col", UUID_LOW), fileSchema, blockMetaData)) + .as("Should read: file contains values not in the exclusion set") + .isTrue(); + } + } + + /** + * Tests that the dual-comparator approach correctly handles queries that would give different + * results with signed vs unsigned comparison. This is critical for backward compatibility with + * legacy files that may have UUID bounds computed with signed comparison. + */ + @TestTemplate + public void testUUIDComparisonBoundary() throws IOException { + assumeThat(format).as("Only valid for Parquet").isEqualTo(FileFormat.PARQUET); + + // Create a file with only high-bit UUIDs (0x80... and above) + // These are the UUIDs that would be ordered differently in signed vs unsigned comparison + List records = Lists.newArrayList(); + UUID[] highBitUuids = {UUID_HIGH, UUID_HIGHER}; + for (int i = 0; i < highBitUuids.length; i++) { + GenericRecord record = GenericRecord.create(UUID_SCHEMA); + record.setField("id", i); + record.setField("uuid_col", highBitUuids[i]); + records.add(record); + } + + File parquetFile = writeParquetFile("uuid-high-only-test", UUID_SCHEMA, records); + InputFile inFile = Files.localInput(parquetFile); + try (ParquetFileReader reader = ParquetFileReader.open(parquetInputFile(inFile))) { + BlockMetaData blockMetaData = reader.getRowGroups().get(0); + MessageType fileSchema = reader.getFileMetaData().getSchema(); + + // Query: uuid > UUID_MID (0x40...) + // In RFC unsigned order: UUID_HIGH (0x80...) > UUID_MID, so should match + // In signed order: UUID_HIGH (0x80...) < UUID_MID (negative vs positive), so would not match + // The dual-comparator should ensure we read this file + assertThat(shouldReadUUID(greaterThan("uuid_col", UUID_MID), fileSchema, blockMetaData)) + .as("Should read: file contains UUIDs greater than UUID_MID in RFC order") + .isTrue(); + + // Query: uuid < UUID_MID (0x40...) + // In RFC unsigned order: all values (0x80..., 0xc0...) > UUID_MID, so should NOT match + // In signed order: all values would be < UUID_MID (negative < positive), so would match + // The dual-comparator returns true (should read) because signed comparison says there + // might be matches - this is the conservative behavior for backward compatibility + assertThat(shouldReadUUID(lessThan("uuid_col", UUID_MID), fileSchema, blockMetaData)) + .as("Should read: dual-comparator is conservative for legacy file compatibility") + .isTrue(); + + // Query: uuid > UUID_LOW (0x00...) + // Both comparators agree: all high-bit UUIDs are greater than UUID_LOW + assertThat(shouldReadUUID(greaterThan("uuid_col", UUID_LOW), fileSchema, blockMetaData)) + .as("Should read: all values are greater than UUID_LOW") + .isTrue(); + } + } + + /** + * Tests reading a legacy Parquet file's metadata where UUID min/max statistics were computed + * using signed comparison. + * + *

In signed comparison, UUIDs with high bit set (0x80...) are treated as negative, so they + * sort before UUIDs without high bit set. This creates "inverted bounds" where min > max in RFC + * unsigned order. + * + *

This test uses mocked Parquet metadata to simulate such a legacy file, verifying that the + * dual-comparator approach correctly handles queries that would otherwise be incorrectly filtered + * out using the unsigned comparison. + */ + @TestTemplate + public void testLegacyUUIDParquetFileWithSignedComparator() throws IOException { + assumeThat(format).as("Only valid for Parquet").isEqualTo(FileFormat.PARQUET); + + MessageType legacySchema = ParquetSchemaUtil.convert(UUID_SCHEMA, "legacy_uuid_test"); + + // Simulate inverted bounds from a legacy file written with signed comparator. + // In signed order: 0x80... (negative) < 0x00... < 0x20... < 0x40... (positive) + // So min=0x80..., max=0x40... represents a valid range in signed order, + // but appears inverted in RFC unsigned order where 0x40... < 0x80... + // Therefore, with unsigned UUID expression evaluation, the Parquet row group is not read. + UUID legacyMin = UUID.fromString("80000000-0000-0000-0000-000000000001"); + UUID legacyMax = UUID.fromString("40000000-0000-0000-0000-000000000001"); + + BlockMetaData mockRowGroup = createMockRowGroupWithUUIDStats(legacyMin, legacyMax); + + // Test 1: equal(0x20...) + // In unsigned order: 0x20... < 0x80... (min), fails lower bound check + // In signed order: 0x80... < 0x20... < 0x40..., passes bounds check + UUID queryInRange = UUID.fromString("20000000-0000-0000-0000-000000000001"); + assertThat(shouldReadUUID(equal("uuid_col", queryInRange), legacySchema, mockRowGroup)) + .as("Should read: signed fallback finds UUID in signed-ordered bounds") + .isTrue(); + + // Test 2: lessThan(0x30...) + // In unsigned order: min (0x80...) > 0x30..., fails lower bound check + // In signed order: min (0x80...) < 0x30..., passes lower bound check + UUID queryLt = UUID.fromString("30000000-0000-0000-0000-000000000001"); + assertThat(shouldReadUUID(lessThan("uuid_col", queryLt), legacySchema, mockRowGroup)) + .as("Should read: signed fallback finds UUIDs less than query in signed order") + .isTrue(); + + // Test 3: greaterThan(0x20...) + // In unsigned order: max (0x40...) < 0x80... (min), bounds are inverted + // In signed order: max (0x40...) > 0x20..., passes upper bound check + assertThat(shouldReadUUID(greaterThan("uuid_col", queryInRange), legacySchema, mockRowGroup)) + .as("Should read: signed fallback finds UUIDs greater than query in signed order") + .isTrue(); + + // Test 4: in(0x20..., 0x30...) + // In unsigned order: both values < 0x80... (min), fails lower bound check + // In signed order: both values between 0x80... and 0x40..., passes bounds check + assertThat(shouldReadUUID(in("uuid_col", queryInRange, queryLt), legacySchema, mockRowGroup)) + .as("Should read: signed fallback finds UUIDs in signed-ordered bounds") + .isTrue(); + + // Test 5: equal(0x50...) + // In unsigned order: 0x50... < 0x80... (min), fails lower bound check + // In signed order: 0x50... > 0x40... (max), fails upper bound check + UUID outsideBounds = UUID.fromString("50000000-0000-0000-0000-000000000001"); + assertThat(shouldReadUUID(equal("uuid_col", outsideBounds), legacySchema, mockRowGroup)) + .as("Should not read: UUID outside bounds in both RFC and signed order") + .isFalse(); + } + + /** + * Creates a mock BlockMetaData with UUID column statistics set to the specified min/max values. + * This simulates a legacy Parquet file where statistics were computed using signed comparison. + */ + private BlockMetaData createMockRowGroupWithUUIDStats(UUID minUuid, UUID maxUuid) { + // Convert UUIDs to Binary (16-byte fixed-length) + ByteBuffer minBuffer = UUIDUtil.convertToByteBuffer(minUuid); + ByteBuffer maxBuffer = UUIDUtil.convertToByteBuffer(maxUuid); + Binary minBinary = Binary.fromConstantByteBuffer(minBuffer); + Binary maxBinary = Binary.fromConstantByteBuffer(maxBuffer); + + // Create mock Statistics + @SuppressWarnings("unchecked") + Statistics mockStats = mock(Statistics.class); + when(mockStats.isEmpty()).thenReturn(false); + when(mockStats.hasNonNullValue()).thenReturn(true); + when(mockStats.isNumNullsSet()).thenReturn(true); + when(mockStats.getNumNulls()).thenReturn(0L); + when(mockStats.genericGetMin()).thenReturn(minBinary); + when(mockStats.genericGetMax()).thenReturn(maxBinary); + when(mockStats.getMinBytes()).thenReturn(minBuffer.array()); + when(mockStats.getMaxBytes()).thenReturn(maxBuffer.array()); + + // Create mock ColumnChunkMetaData for the UUID column + ColumnChunkMetaData mockUuidColumn = mock(ColumnChunkMetaData.class); + when(mockUuidColumn.getPath()).thenReturn(ColumnPath.fromDotString("uuid_col")); + when(mockUuidColumn.getStatistics()).thenReturn(mockStats); + when(mockUuidColumn.getValueCount()).thenReturn(50L); + + // Create mock ColumnChunkMetaData for the id column (required for schema) + ColumnChunkMetaData mockIdColumn = mock(ColumnChunkMetaData.class); + when(mockIdColumn.getPath()).thenReturn(ColumnPath.fromDotString("id")); + @SuppressWarnings("unchecked") + Statistics mockIdStats = mock(Statistics.class); + when(mockIdStats.isEmpty()).thenReturn(true); + when(mockIdColumn.getStatistics()).thenReturn(mockIdStats); + when(mockIdColumn.getValueCount()).thenReturn(50L); + + // Create mock BlockMetaData + BlockMetaData mockRowGroup = mock(BlockMetaData.class); + when(mockRowGroup.getRowCount()).thenReturn(50L); + when(mockRowGroup.getColumns()).thenReturn(Arrays.asList(mockIdColumn, mockUuidColumn)); + + return mockRowGroup; + } + + private boolean shouldReadUUID( + Expression expression, MessageType messageType, BlockMetaData blockMetaData) { + return new ParquetMetricsRowGroupFilter(UUID_SCHEMA, expression, true) + .shouldRead(messageType, blockMetaData); + } + private boolean shouldRead(Expression expression) { return shouldRead(expression, true); } diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java index b0d98b358b6d..0c3b60827b96 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java @@ -29,6 +29,7 @@ import org.apache.iceberg.SortKey; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Comparators; public class DataDistributionUtil { private DataDistributionUtil() {} @@ -162,7 +163,7 @@ public static UUID[] reservoirSampleUUIDs(int sampleSize, int reservoirSize) { } } - Arrays.sort(reservoir); + Arrays.sort(reservoir, Comparators.uuids()); return reservoir; } @@ -172,7 +173,7 @@ public static UUID[] rangeBoundSampleUUIDs(UUID[] sampledUUIDs, int rangeBoundSi for (int i = 0; i < rangeBoundSize; ++i) { rangeBounds[i] = sampledUUIDs[i * step]; } - Arrays.sort(rangeBounds); + Arrays.sort(rangeBounds, Comparators.uuids()); return rangeBounds; } } diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java index b0d98b358b6d..0c3b60827b96 100644 --- a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java @@ -29,6 +29,7 @@ import org.apache.iceberg.SortKey; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Comparators; public class DataDistributionUtil { private DataDistributionUtil() {} @@ -162,7 +163,7 @@ public static UUID[] reservoirSampleUUIDs(int sampleSize, int reservoirSize) { } } - Arrays.sort(reservoir); + Arrays.sort(reservoir, Comparators.uuids()); return reservoir; } @@ -172,7 +173,7 @@ public static UUID[] rangeBoundSampleUUIDs(UUID[] sampledUUIDs, int rangeBoundSi for (int i = 0; i < rangeBoundSize; ++i) { rangeBounds[i] = sampledUUIDs[i * step]; } - Arrays.sort(rangeBounds); + Arrays.sort(rangeBounds, Comparators.uuids()); return rangeBounds; } } diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java index b0d98b358b6d..0c3b60827b96 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/DataDistributionUtil.java @@ -29,6 +29,7 @@ import org.apache.iceberg.SortKey; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Comparators; public class DataDistributionUtil { private DataDistributionUtil() {} @@ -162,7 +163,7 @@ public static UUID[] reservoirSampleUUIDs(int sampleSize, int reservoirSize) { } } - Arrays.sort(reservoir); + Arrays.sort(reservoir, Comparators.uuids()); return reservoir; } @@ -172,7 +173,7 @@ public static UUID[] rangeBoundSampleUUIDs(UUID[] sampledUUIDs, int rangeBoundSi for (int i = 0; i < rangeBoundSize; ++i) { rangeBounds[i] = sampledUUIDs[i * step]; } - Arrays.sort(rangeBounds); + Arrays.sort(rangeBounds, Comparators.uuids()); return rangeBounds; } } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java index cae9447513c0..a459ae710222 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java @@ -30,6 +30,7 @@ import org.apache.iceberg.expressions.Bound; import org.apache.iceberg.expressions.BoundReference; import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.expressions.ExpressionUtil; import org.apache.iceberg.expressions.ExpressionVisitors; import org.apache.iceberg.expressions.ExpressionVisitors.BoundExpressionVisitor; import org.apache.iceberg.expressions.Expressions; @@ -51,6 +52,9 @@ public class ParquetMetricsRowGroupFilter { private final Schema schema; private final Expression expr; + // Expression using signed UUID comparator for backward compatibility with files written before + // RFC-compliant UUID comparisons were introduced. Null if no UUID predicates. + private final Expression signedUuidExpr; public ParquetMetricsRowGroupFilter(Schema schema, Expression unbound) { this(schema, unbound, true); @@ -59,7 +63,14 @@ public ParquetMetricsRowGroupFilter(Schema schema, Expression unbound) { public ParquetMetricsRowGroupFilter(Schema schema, Expression unbound, boolean caseSensitive) { this.schema = schema; StructType struct = schema.asStruct(); - this.expr = Binder.bind(struct, Expressions.rewriteNot(unbound), caseSensitive); + Expression rewritten = Expressions.rewriteNot(unbound); + this.expr = Binder.bind(struct, rewritten, caseSensitive); + + // Create the signed UUID expression if and only if there are UUID predicates + // that compare against bounds. + Expression transformed = ExpressionUtil.toSignedUUIDLiteral(rewritten); + this.signedUuidExpr = + transformed != null ? Binder.bind(struct, transformed, caseSensitive) : null; } /** @@ -70,18 +81,43 @@ public ParquetMetricsRowGroupFilter(Schema schema, Expression unbound, boolean c * @return false if the file cannot contain rows that match the expression, true otherwise. */ public boolean shouldRead(MessageType fileSchema, BlockMetaData rowGroup) { - return new MetricsEvalVisitor().eval(fileSchema, rowGroup); + boolean result = new MetricsEvalVisitor(schema).eval(fileSchema, rowGroup, expr, false); + + // If the RFC-compliant evaluation says rows might match, or there's no signed UUID expression, + // return the result. + if (result || signedUuidExpr == null) { + return result; + } + + // Try again with signed UUID comparator. There is no quick way to detect + // whether signed or unsigned comparator was used when the UUID column stats were written. + // The signedUuidExpr has literals with signed comparators for lt/gt/eq predicates. + // For IN predicates, signed comparator is used via Comparators#comparatorFor. + return new MetricsEvalVisitor(schema).eval(fileSchema, rowGroup, signedUuidExpr, true); } private static final boolean ROWS_MIGHT_MATCH = true; private static final boolean ROWS_CANNOT_MATCH = false; - private class MetricsEvalVisitor extends BoundExpressionVisitor { + private static class MetricsEvalVisitor extends BoundExpressionVisitor { + private final Schema schema; private Map> stats = null; private Map valueCounts = null; private Map> conversions = null; + // Flag to use signed UUID comparator for backward compatibility. + // This is specifically necessary for IN predicates because the comparator information + // is lost when binding converts literals to a Set of raw values. + private boolean useSignedUuidComparator = false; + + private MetricsEvalVisitor(Schema schema) { + this.schema = schema; + } - private boolean eval(MessageType fileSchema, BlockMetaData rowGroup) { + private boolean eval( + MessageType fileSchema, + BlockMetaData rowGroup, + Expression expression, + boolean signedUuidMode) { if (rowGroup.getRowCount() <= 0) { return ROWS_CANNOT_MATCH; } @@ -89,6 +125,7 @@ private boolean eval(MessageType fileSchema, BlockMetaData rowGroup) { this.stats = Maps.newHashMap(); this.valueCounts = Maps.newHashMap(); this.conversions = Maps.newHashMap(); + this.useSignedUuidComparator = signedUuidMode; for (ColumnChunkMetaData col : rowGroup.getColumns()) { PrimitiveType colType = fileSchema.getType(col.getPath().toArray()).asPrimitiveType(); if (colType.getId() != null) { @@ -100,7 +137,7 @@ private boolean eval(MessageType fileSchema, BlockMetaData rowGroup) { } } - return ExpressionVisitors.visitEvaluator(expr, this); + return ExpressionVisitors.visitEvaluator(expression, this); } @Override @@ -406,20 +443,19 @@ public Boolean in(BoundReference ref, Set literalSet) { return ROWS_MIGHT_MATCH; } + // use an appropriate comparator, for UUIDs use the signed comparator if requested + Comparator cmp = + Comparators.comparatorFor(ref.type(), ref.comparator(), useSignedUuidComparator); T lower = min(colStats, id); literals = - literals.stream() - .filter(v -> ref.comparator().compare(lower, v) <= 0) - .collect(Collectors.toList()); + literals.stream().filter(v -> cmp.compare(lower, v) <= 0).collect(Collectors.toList()); if (literals.isEmpty()) { // if all values are less than lower bound, rows cannot match. return ROWS_CANNOT_MATCH; } T upper = max(colStats, id); literals = - literals.stream() - .filter(v -> ref.comparator().compare(upper, v) >= 0) - .collect(Collectors.toList()); + literals.stream().filter(v -> cmp.compare(upper, v) >= 0).collect(Collectors.toList()); if (literals .isEmpty()) { // if all remaining values are greater than upper bound, rows cannot // match.