From 7b9ba7f7b29f29435febc613aa19139f07968f62 Mon Sep 17 00:00:00 2001 From: Aihua Xu Date: Sun, 5 Oct 2025 20:57:16 -0700 Subject: [PATCH 1/3] Handle NPE for VariantLogicalType in TypeWithSchemaVistor --- .../parquet/TypeWithSchemaVisitor.java | 2 +- .../iceberg/parquet/TestPruneColumns.java | 63 +++++++++++++++++++ 2 files changed, 64 insertions(+), 1 deletion(-) diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/TypeWithSchemaVisitor.java b/parquet/src/main/java/org/apache/iceberg/parquet/TypeWithSchemaVisitor.java index 4ab454829765..c5268bf51a26 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/TypeWithSchemaVisitor.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/TypeWithSchemaVisitor.java @@ -64,7 +64,7 @@ public static T visit( } else if (annotation instanceof LogicalTypeAnnotation.VariantLogicalTypeAnnotation || (iType != null && iType.isVariantType())) { // when Parquet has a VARIANT logical type, use it here - return visitVariant(iType.asVariantType(), group, visitor); + return visitVariant(iType != null ? iType.asVariantType() : null, group, visitor); } Types.StructType struct = iType != null ? iType.asStructType() : null; diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestPruneColumns.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestPruneColumns.java index 70345adf1b8b..8e0d4f691f1c 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestPruneColumns.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestPruneColumns.java @@ -20,13 +20,16 @@ import static org.assertj.core.api.Assertions.assertThat; +import java.util.List; import org.apache.iceberg.Schema; import org.apache.iceberg.types.Types.DoubleType; +import org.apache.iceberg.types.Types.IntegerType; import org.apache.iceberg.types.Types.ListType; import org.apache.iceberg.types.Types.MapType; import org.apache.iceberg.types.Types.NestedField; import org.apache.iceberg.types.Types.StringType; import org.apache.iceberg.types.Types.StructType; +import org.apache.iceberg.types.Types.VariantType; import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; @@ -270,4 +273,64 @@ public void testStructElementName() { MessageType actual = ParquetSchemaUtil.pruneColumns(fileSchema, projection); assertThat(actual).as("Pruned schema should be matched").isEqualTo(expected); } + + @Test + public void testVariant() { + MessageType fileSchema = + Types.buildMessage() + .addField( + Types.primitive(PrimitiveTypeName.INT32, Type.Repetition.REQUIRED) + .id(1) + .named("id")) + .addField( + Types.buildGroup(Type.Repetition.OPTIONAL) + .as(LogicalTypeAnnotation.variantType((byte) 1)) + .addField( + Types.primitive(PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) + .named("metadata")) + .addField( + Types.primitive(PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) + .named("value")) + .id(2) + .named("variant_1")) + .addField( + Types.buildGroup(Type.Repetition.OPTIONAL) + .as(LogicalTypeAnnotation.variantType((byte) 1)) + .addField( + Types.primitive(PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) + .named("metadata")) + .addField( + Types.primitive(PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) + .named("value")) + .id(3) + .named("variant_2")) + .named("table"); + + Schema projection = + new Schema( + List.of( + NestedField.required(1, "id", IntegerType.get()), + NestedField.required(2, "variant_1", VariantType.get()))); + MessageType expected = + Types.buildMessage() + .addField( + Types.primitive(PrimitiveTypeName.INT32, Type.Repetition.REQUIRED) + .id(1) + .named("id")) + .addField( + Types.buildGroup(Type.Repetition.OPTIONAL) + .as(LogicalTypeAnnotation.variantType((byte) 1)) + .addField( + Types.primitive(PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) + .named("metadata")) + .addField( + Types.primitive(PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) + .named("value")) + .id(2) + .named("variant_1")) + .named("table"); + + MessageType actual = ParquetSchemaUtil.pruneColumns(fileSchema, projection); + assertThat(actual).as("Pruned schema should be matched").isEqualTo(expected); + } } From bae332847ea3c588b7f0b9eb2c218d1ea04b1d4b Mon Sep 17 00:00:00 2001 From: Aihua Xu Date: Mon, 6 Oct 2025 11:13:07 -0700 Subject: [PATCH 2/3] Change to use ImmutableList --- .../java/org/apache/iceberg/parquet/TestPruneColumns.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestPruneColumns.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestPruneColumns.java index 8e0d4f691f1c..c68c907aab73 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestPruneColumns.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestPruneColumns.java @@ -20,8 +20,8 @@ import static org.assertj.core.api.Assertions.assertThat; -import java.util.List; import org.apache.iceberg.Schema; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.types.Types.DoubleType; import org.apache.iceberg.types.Types.IntegerType; import org.apache.iceberg.types.Types.ListType; @@ -308,7 +308,7 @@ public void testVariant() { Schema projection = new Schema( - List.of( + ImmutableList.of( NestedField.required(1, "id", IntegerType.get()), NestedField.required(2, "variant_1", VariantType.get()))); MessageType expected = From ee150f3ab50dccde4ec555807196b43533668504 Mon Sep 17 00:00:00 2001 From: Aihua Xu Date: Wed, 8 Oct 2025 09:57:15 -0700 Subject: [PATCH 3/3] Extract buildVariantType --- .../iceberg/parquet/TestPruneColumns.java | 48 ++++++------------- 1 file changed, 15 insertions(+), 33 deletions(-) diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestPruneColumns.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestPruneColumns.java index c68c907aab73..619b2c5a3470 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestPruneColumns.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestPruneColumns.java @@ -30,6 +30,7 @@ import org.apache.iceberg.types.Types.StringType; import org.apache.iceberg.types.Types.StructType; import org.apache.iceberg.types.Types.VariantType; +import org.apache.iceberg.variants.Variant; import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; @@ -282,28 +283,8 @@ public void testVariant() { Types.primitive(PrimitiveTypeName.INT32, Type.Repetition.REQUIRED) .id(1) .named("id")) - .addField( - Types.buildGroup(Type.Repetition.OPTIONAL) - .as(LogicalTypeAnnotation.variantType((byte) 1)) - .addField( - Types.primitive(PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) - .named("metadata")) - .addField( - Types.primitive(PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) - .named("value")) - .id(2) - .named("variant_1")) - .addField( - Types.buildGroup(Type.Repetition.OPTIONAL) - .as(LogicalTypeAnnotation.variantType((byte) 1)) - .addField( - Types.primitive(PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) - .named("metadata")) - .addField( - Types.primitive(PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) - .named("value")) - .id(3) - .named("variant_2")) + .addField(buildVariantType(2, "variant_1")) + .addField(buildVariantType(3, "variant_2")) .named("table"); Schema projection = @@ -317,20 +298,21 @@ public void testVariant() { Types.primitive(PrimitiveTypeName.INT32, Type.Repetition.REQUIRED) .id(1) .named("id")) - .addField( - Types.buildGroup(Type.Repetition.OPTIONAL) - .as(LogicalTypeAnnotation.variantType((byte) 1)) - .addField( - Types.primitive(PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) - .named("metadata")) - .addField( - Types.primitive(PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) - .named("value")) - .id(2) - .named("variant_1")) + .addField(buildVariantType(2, "variant_1")) .named("table"); MessageType actual = ParquetSchemaUtil.pruneColumns(fileSchema, projection); assertThat(actual).as("Pruned schema should be matched").isEqualTo(expected); } + + private static Type buildVariantType(int id, String name) { + return Types.buildGroup(Type.Repetition.OPTIONAL) + .as(LogicalTypeAnnotation.variantType(Variant.VARIANT_SPEC_VERSION)) + .addField( + Types.primitive(PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED).named("metadata")) + .addField( + Types.primitive(PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED).named("value")) + .id(id) + .named(name); + } }