From f766f63779af992bdb65199ab8a11740314f1325 Mon Sep 17 00:00:00 2001 From: RamaKrishna Mothukuri Date: Thu, 1 Aug 2024 18:46:11 +0530 Subject: [PATCH 1/7] Fix to set dictionary_page_offset correctly when encoding_stats are missing --- .../parquet/format/converter/ParquetMetadataConverter.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index 194670f2df..08f9a61a2e 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -557,8 +557,9 @@ private void addRowGroup( columnMetaData.getTotalUncompressedSize(), columnMetaData.getTotalSize(), columnMetaData.getFirstDataPageOffset()); - if (columnMetaData.getEncodingStats() != null - && columnMetaData.getEncodingStats().hasDictionaryPages()) { + if ((columnMetaData.getEncodingStats() != null + && columnMetaData.getEncodingStats().hasDictionaryPages()) + || columnMetaData.hasDictionaryPage()) { metaData.setDictionary_page_offset(columnMetaData.getDictionaryPageOffset()); } long bloomFilterOffset = columnMetaData.getBloomFilterOffset(); From 4bc53dc66e41659146a75cbc9fc2835b984b4b9d Mon Sep 17 00:00:00 2001 From: RamaKrishna Mothukuri Date: Fri, 2 Aug 2024 12:36:20 +0530 Subject: [PATCH 2/7] Format fix --- .../parquet/format/converter/ParquetMetadataConverter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index 08f9a61a2e..bb9e58b7ad 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -558,7 +558,7 @@ private void addRowGroup( columnMetaData.getTotalSize(), columnMetaData.getFirstDataPageOffset()); if ((columnMetaData.getEncodingStats() != null - && columnMetaData.getEncodingStats().hasDictionaryPages()) + && columnMetaData.getEncodingStats().hasDictionaryPages()) || columnMetaData.hasDictionaryPage()) { metaData.setDictionary_page_offset(columnMetaData.getDictionaryPageOffset()); } From d3dc570e30fa5b4a88e632c5717af300e19901a4 Mon Sep 17 00:00:00 2001 From: RamaKrishna Mothukuri Date: Fri, 2 Aug 2024 13:17:47 +0530 Subject: [PATCH 3/7] Test case changes --- .../TestParquetMetadataConverter.java | 26 +++++++++++++++---- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java index 2cffb51860..5c1f8bd0eb 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java @@ -206,7 +206,16 @@ public void testSchemaConverterDecimal() { @Test public void testParquetMetadataConverterWithDictionary() throws IOException { ParquetMetadata parquetMetaData = createParquetMetaData(Encoding.PLAIN_DICTIONARY, Encoding.PLAIN); + testParquetMetadataConverterWithDictionary(parquetMetaData) + } + @Test + public void testParquetMetadataConverterWithDictionaryAndWithoutEncodingStats() throws IOException { + ParquetMetadata parquetMetaData = createParquetMetaData(Encoding.PLAIN_DICTIONARY, Encoding.PLAIN, false); + testParquetMetadataConverterWithDictionary(parquetMetaData) + } + + private void testParquetMetadataConverterWithDictionary(ParquetMetadata parquetMetaData) throws IOException { ParquetMetadataConverter converter = new ParquetMetadataConverter(); FileMetaData fmd1 = converter.toParquetMetadata(1, parquetMetaData); @@ -1283,17 +1292,24 @@ private static Statistics createStatsTyped(PrimitiveType type, BigInteger min } private static ParquetMetadata createParquetMetaData(Encoding dicEncoding, Encoding dataEncoding) { + return createParquetMetaData(dicEncoding, dataEncoding, true); + } + + private static ParquetMetadata createParquetMetaData(Encoding dicEncoding, Encoding dataEncoding, boolean includeEncodingStats) { MessageType schema = parseMessageType("message schema { optional int32 col (INT_32); }"); org.apache.parquet.hadoop.metadata.FileMetaData fileMetaData = new org.apache.parquet.hadoop.metadata.FileMetaData(schema, new HashMap(), null); List blockMetaDataList = new ArrayList(); BlockMetaData blockMetaData = new BlockMetaData(); - EncodingStats.Builder builder = new EncodingStats.Builder(); - if (dicEncoding != null) { - builder.addDictEncoding(dicEncoding).build(); + EncodingStats es = null; + if (includeEncodingStats) { + EncodingStats.Builder builder = new EncodingStats.Builder(); + if (dicEncoding != null) { + builder.addDictEncoding(dicEncoding).build(); + } + builder.addDataEncoding(dataEncoding); + es = builder.build(); } - builder.addDataEncoding(dataEncoding); - EncodingStats es = builder.build(); Set e = new HashSet(); PrimitiveTypeName t = PrimitiveTypeName.INT32; ColumnPath p = ColumnPath.get("col"); From 94601ade8006ecb668e7c908f66a1c8c99816286 Mon Sep 17 00:00:00 2001 From: RamaKrishna Mothukuri Date: Mon, 5 Aug 2024 16:09:21 +0530 Subject: [PATCH 4/7] Test case changes --- .../format/converter/TestParquetMetadataConverter.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java index 5c1f8bd0eb..f6740d91db 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java @@ -206,13 +206,13 @@ public void testSchemaConverterDecimal() { @Test public void testParquetMetadataConverterWithDictionary() throws IOException { ParquetMetadata parquetMetaData = createParquetMetaData(Encoding.PLAIN_DICTIONARY, Encoding.PLAIN); - testParquetMetadataConverterWithDictionary(parquetMetaData) + testParquetMetadataConverterWithDictionary(parquetMetaData); } @Test public void testParquetMetadataConverterWithDictionaryAndWithoutEncodingStats() throws IOException { ParquetMetadata parquetMetaData = createParquetMetaData(Encoding.PLAIN_DICTIONARY, Encoding.PLAIN, false); - testParquetMetadataConverterWithDictionary(parquetMetaData) + testParquetMetadataConverterWithDictionary(parquetMetaData); } private void testParquetMetadataConverterWithDictionary(ParquetMetadata parquetMetaData) throws IOException { From b881b805e9e92bbaa7acf83c9e8d0b0ae3983f49 Mon Sep 17 00:00:00 2001 From: RamaKrishna Mothukuri Date: Mon, 5 Aug 2024 16:41:57 +0530 Subject: [PATCH 5/7] Test case changes --- .../format/converter/TestParquetMetadataConverter.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java index f6740d91db..6ddd5aa992 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java @@ -214,7 +214,7 @@ public void testParquetMetadataConverterWithDictionaryAndWithoutEncodingStats() ParquetMetadata parquetMetaData = createParquetMetaData(Encoding.PLAIN_DICTIONARY, Encoding.PLAIN, false); testParquetMetadataConverterWithDictionary(parquetMetaData); } - + private void testParquetMetadataConverterWithDictionary(ParquetMetadata parquetMetaData) throws IOException { ParquetMetadataConverter converter = new ParquetMetadataConverter(); FileMetaData fmd1 = converter.toParquetMetadata(1, parquetMetaData); @@ -1294,8 +1294,9 @@ private static Statistics createStatsTyped(PrimitiveType type, BigInteger min private static ParquetMetadata createParquetMetaData(Encoding dicEncoding, Encoding dataEncoding) { return createParquetMetaData(dicEncoding, dataEncoding, true); } - - private static ParquetMetadata createParquetMetaData(Encoding dicEncoding, Encoding dataEncoding, boolean includeEncodingStats) { + + private static ParquetMetadata createParquetMetaData( + Encoding dicEncoding, Encoding dataEncoding, boolean includeEncodingStats) { MessageType schema = parseMessageType("message schema { optional int32 col (INT_32); }"); org.apache.parquet.hadoop.metadata.FileMetaData fileMetaData = new org.apache.parquet.hadoop.metadata.FileMetaData(schema, new HashMap(), null); From c7f143fbcfc9fe9f231b26cf62772ff76028d31b Mon Sep 17 00:00:00 2001 From: RamaKrishna Mothukuri Date: Mon, 5 Aug 2024 18:18:04 +0530 Subject: [PATCH 6/7] Test case changes --- .../format/converter/TestParquetMetadataConverter.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java index 6ddd5aa992..144f4d4581 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java @@ -1312,6 +1312,10 @@ private static ParquetMetadata createParquetMetaData( es = builder.build(); } Set e = new HashSet(); + if (dicEncoding != null) { + e.add(dicEncoding); + } + e.add(dataEncoding); PrimitiveTypeName t = PrimitiveTypeName.INT32; ColumnPath p = ColumnPath.get("col"); CompressionCodecName c = CompressionCodecName.UNCOMPRESSED; From a9bf8e59f9fed9d8bf25b3f1b39a5a427fee7ebf Mon Sep 17 00:00:00 2001 From: RamaKrishna Mothukuri Date: Mon, 5 Aug 2024 18:19:48 +0530 Subject: [PATCH 7/7] Test case changes --- .../format/converter/TestParquetMetadataConverter.java | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java index 144f4d4581..6b3259070e 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java @@ -1312,10 +1312,12 @@ private static ParquetMetadata createParquetMetaData( es = builder.build(); } Set e = new HashSet(); - if (dicEncoding != null) { - e.add(dicEncoding); + if (!includeEncodingStats) { + if (dicEncoding != null) { + e.add(dicEncoding); + } + e.add(dataEncoding); } - e.add(dataEncoding); PrimitiveTypeName t = PrimitiveTypeName.INT32; ColumnPath p = ColumnPath.get("col"); CompressionCodecName c = CompressionCodecName.UNCOMPRESSED;