From 1d973da92c9acf0980e5b765a8cdcf83d040448d Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 12 Nov 2025 23:00:56 +0000 Subject: [PATCH 01/33] feat!: Support compression codecs for JSON metadata and Avro Previously these properties where not honored on tabel properties. - Adds table properties for these values. - Plumbs them through for writers. --- Cargo.lock | 6 +- crates/iceberg/Cargo.toml | 2 + crates/iceberg/src/spec/avro_util.rs | 243 ++++++++++++++++++ crates/iceberg/src/spec/manifest/writer.rs | 42 ++- .../iceberg/src/spec/manifest_list/writer.rs | 41 ++- crates/iceberg/src/spec/mod.rs | 1 + crates/iceberg/src/spec/table_properties.rs | 53 +++- crates/iceberg/src/transaction/snapshot.rs | 31 ++- 8 files changed, 407 insertions(+), 12 deletions(-) create mode 100644 crates/iceberg/src/spec/avro_util.rs diff --git a/Cargo.lock b/Cargo.lock index c7c5f6680d..e1a8d958db 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3770,7 +3770,9 @@ dependencies = [ "futures", "iceberg_test_utils", "itertools 0.13.0", + "log", "minijinja", + "miniz_oxide", "mockall", "moka", "murmur3", @@ -6020,9 +6022,9 @@ dependencies = [ [[package]] name = "quinn-proto" -version = "0.11.15" +version = "0.11.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fcb935c5bec503c2f0e306bdd3e58bb9029dcb14fa8d9ac76e3a5256ac0763e" +checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098" dependencies = [ "aws-lc-rs", "bytes", diff --git a/crates/iceberg/Cargo.toml b/crates/iceberg/Cargo.toml index 9353a31842..afb961d9dc 100644 --- a/crates/iceberg/Cargo.toml +++ b/crates/iceberg/Cargo.toml @@ -59,6 +59,8 @@ flate2 = { workspace = true } fnv = { workspace = true } futures = { workspace = true } itertools = { workspace = true } +log = { workspace = true } +miniz_oxide = "0.8" moka = { version = "0.12.10", features = ["future"] } murmur3 = { workspace = true } once_cell = { workspace = true } diff --git a/crates/iceberg/src/spec/avro_util.rs b/crates/iceberg/src/spec/avro_util.rs new file mode 100644 index 0000000000..36d702c832 --- /dev/null +++ b/crates/iceberg/src/spec/avro_util.rs @@ -0,0 +1,243 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Utilities for working with Apache Avro in Iceberg. + +use apache_avro::Codec; +use log::warn; + +/// Convert codec name and level to apache_avro::Codec. +/// Returns Codec::Null for unknown or unsupported codecs. +/// +/// # Arguments +/// +/// * `codec` - The name of the compression codec (e.g., "gzip", "zstd", "deflate", "none") +/// * `level` - The compression level. For deflate/gzip: +/// - 0: NoCompression +/// - 1: BestSpeed +/// - 9: BestCompression +/// - 10: UberCompression +/// - Other values: DefaultLevel (6) +/// +/// # Supported Codecs +/// +/// - `gzip` or `deflate`: Uses Deflate compression with specified level +/// - `zstd`: Uses Zstandard compression (level clamped to valid zstd range) +/// - `none` or `None`: No compression +/// - Any other value: Defaults to no compression (Codec::Null) +/// +/// # Compression Levels +/// +/// The compression level mapping is based on miniz_oxide's CompressionLevel enum: +/// - Level 0: No compression +/// - Level 1: Best speed (fastest) +/// - Level 9: Best compression (slower, better compression) +/// - Level 10: Uber compression (slowest, best compression) +/// - Other: Default level (balanced speed/compression) +pub(crate) fn codec_from_str(codec: Option<&str>, level: u8) -> Codec { + use apache_avro::{DeflateSettings, ZstandardSettings}; + + match codec { + Some("gzip") | Some("deflate") => { + // Map compression level to miniz_oxide::deflate::CompressionLevel + // Reference: https://docs.rs/miniz_oxide/latest/miniz_oxide/deflate/enum.CompressionLevel.html + use miniz_oxide::deflate::CompressionLevel; + + let compression_level = match level { + 0 => CompressionLevel::NoCompression, + 1 => CompressionLevel::BestSpeed, + 9 => CompressionLevel::BestCompression, + 10 => CompressionLevel::UberCompression, + _ => CompressionLevel::DefaultLevel, + }; + + Codec::Deflate(DeflateSettings::new(compression_level)) + } + Some("zstd") => { + // Zstandard supports levels 0-22, clamp to valid range + let zstd_level = level.min(22); + Codec::Zstandard(ZstandardSettings::new(zstd_level)) + } + Some("none") | None => Codec::Null, + Some(unknown) => { + warn!( + "Unrecognized compression codec '{}', using no compression (Codec::Null)", + unknown + ); + Codec::Null + } + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use apache_avro::types::Record; + use apache_avro::{Schema, Writer}; + + use super::*; + + #[test] + fn test_codec_from_str_gzip() { + let codec = codec_from_str(Some("gzip"), 5); + assert!(matches!(codec, Codec::Deflate(_))); + } + + #[test] + fn test_codec_from_str_deflate() { + let codec = codec_from_str(Some("deflate"), 9); + assert!(matches!(codec, Codec::Deflate(_))); + } + + #[test] + fn test_codec_from_str_zstd() { + let codec = codec_from_str(Some("zstd"), 3); + assert!(matches!(codec, Codec::Zstandard(_))); + } + + #[test] + fn test_codec_from_str_none() { + let codec = codec_from_str(Some("none"), 0); + assert!(matches!(codec, Codec::Null)); + } + + #[test] + fn test_codec_from_str_null() { + let codec = codec_from_str(None, 0); + assert!(matches!(codec, Codec::Null)); + } + + #[test] + fn test_codec_from_str_unknown() { + let codec = codec_from_str(Some("unknown"), 1); + assert!(matches!(codec, Codec::Null)); + } + + #[test] + fn test_codec_from_str_deflate_levels() { + // Create a simple schema for testing + let schema = Schema::parse_str(r#"{"type": "record", "name": "test", "fields": [{"name": "field", "type": "string"}]}"#).unwrap(); + + // Create test data + let test_str = "test data that should compress differently at different levels. This is a longer string to ensure compression has something to work with. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog."; + + // Test that different compression levels produce different output sizes + let mut sizes = HashMap::new(); + for level in [0, 1, 5, 9, 10] { + let codec = codec_from_str(Some("gzip"), level); + let mut writer = Writer::with_codec(&schema, Vec::new(), codec); + + let mut record = Record::new(&schema).unwrap(); + record.put("field", test_str); + writer.append(record).unwrap(); + + let encoded = writer.into_inner().unwrap(); + sizes.insert(level, encoded.len()); + } + + // Level 0 (NoCompression) should be largest + // Level 10 (UberCompression) should be smallest or equal to level 9 + assert!(sizes[&0] >= sizes[&1], "Level 0 should be >= level 1"); + assert!( + sizes[&1] >= sizes[&9] || sizes[&1] == sizes[&9], + "Level 1 should be >= level 9" + ); + assert!( + sizes[&9] >= sizes[&10] || sizes[&9] == sizes[&10], + "Level 9 should be >= level 10" + ); + } + + #[test] + fn test_codec_from_str_zstd_levels() { + // Create a simple schema for testing + let schema = Schema::parse_str(r#"{"type": "record", "name": "test", "fields": [{"name": "field", "type": "string"}]}"#).unwrap(); + let test_str = "test data that should compress differently at different levels. This is a longer string to ensure compression has something to work with. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog."; + + // Test various levels by checking they produce valid codecs + for level in [0, 3, 15, 22] { + let codec = codec_from_str(Some("zstd"), level); + assert!(matches!(codec, Codec::Zstandard(_))); + + // Verify the codec actually works by compressing data + let mut writer = Writer::with_codec(&schema, Vec::new(), codec); + let mut record = Record::new(&schema).unwrap(); + record.put("field", test_str); + writer.append(record).unwrap(); + + let encoded = writer.into_inner().unwrap(); + assert!(encoded.len() > 0, "Compression should produce output"); + } + + // Test clamping - higher than 22 should be clamped to 22 + let codec_100 = codec_from_str(Some("zstd"), 100); + let codec_22 = codec_from_str(Some("zstd"), 22); + + // Both should work and produce similar results + let mut writer_100 = Writer::with_codec(&schema, Vec::new(), codec_100); + let mut record_100 = Record::new(&schema).unwrap(); + record_100.put("field", test_str); + writer_100.append(record_100).unwrap(); + let encoded_100 = writer_100.into_inner().unwrap(); + + let mut writer_22 = Writer::with_codec(&schema, Vec::new(), codec_22); + let mut record_22 = Record::new(&schema).unwrap(); + record_22.put("field", test_str); + writer_22.append(record_22).unwrap(); + let encoded_22 = writer_22.into_inner().unwrap(); + + // Both should produce the same size since 100 is clamped to 22 + assert_eq!( + encoded_100.len(), + encoded_22.len(), + "Level 100 should be clamped to 22" + ); + } + + #[test] + fn test_compression_level_differences() { + // Create a schema and data that will compress well + let schema = Schema::parse_str(r#"{"type": "record", "name": "test", "fields": [{"name": "field", "type": "string"}]}"#).unwrap(); + + // Use highly compressible data + let test_str = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + + // Test gzip level 0 (no compression) vs level 9 (best compression) + let codec_0 = codec_from_str(Some("gzip"), 0); + let mut writer_0 = Writer::with_codec(&schema, Vec::new(), codec_0); + let mut record_0 = Record::new(&schema).unwrap(); + record_0.put("field", test_str); + writer_0.append(record_0).unwrap(); + let size_0 = writer_0.into_inner().unwrap().len(); + + let codec_9 = codec_from_str(Some("gzip"), 9); + let mut writer_9 = Writer::with_codec(&schema, Vec::new(), codec_9); + let mut record_9 = Record::new(&schema).unwrap(); + record_9.put("field", test_str); + writer_9.append(record_9).unwrap(); + let size_9 = writer_9.into_inner().unwrap().len(); + + // Level 0 should produce larger output than level 9 for compressible data + assert!( + size_0 > size_9, + "NoCompression (level 0) should produce larger output than BestCompression (level 9): {} vs {}", + size_0, + size_9 + ); + } +} diff --git a/crates/iceberg/src/spec/manifest/writer.rs b/crates/iceberg/src/spec/manifest/writer.rs index 88da1a396c..f3e92b30f8 100644 --- a/crates/iceberg/src/spec/manifest/writer.rs +++ b/crates/iceberg/src/spec/manifest/writer.rs @@ -31,11 +31,12 @@ use super::{ use crate::encryption::EncryptedOutputFile; use crate::error::Result; use crate::io::{FileWrite, OutputFile}; +use crate::spec::avro_util::codec_from_str; use crate::spec::manifest::_serde::{ManifestEntryV1, ManifestEntryV2}; use crate::spec::manifest::{manifest_schema_v1, manifest_schema_v2}; use crate::spec::{ DataContentType, DataFile, FieldSummary, ManifestEntry, ManifestFile, ManifestMetadata, - ManifestStatus, PrimitiveLiteral, SchemaRef, StructType, + ManifestStatus, PrimitiveLiteral, SchemaRef, StructType, TableProperties, }; use crate::{Error, ErrorKind}; @@ -53,6 +54,8 @@ pub struct ManifestWriterBuilder { key_metadata: Option>, schema: SchemaRef, partition_spec: PartitionSpec, + compression_codec: String, + compression_level: u8, } impl ManifestWriterBuilder { @@ -72,6 +75,8 @@ impl ManifestWriterBuilder { key_metadata, schema, partition_spec, + compression_codec: TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC_DEFAULT.to_string(), + compression_level: TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL_DEFAULT, } } @@ -93,9 +98,18 @@ impl ManifestWriterBuilder { key_metadata, schema, partition_spec, + compression_codec: TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC_DEFAULT.to_string(), + compression_level: TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL_DEFAULT, } } + /// Set compression codec and level for the manifest file. + pub fn with_compression(mut self, codec: String, level: u8) -> Self { + self.compression_codec = codec; + self.compression_level = level; + self + } + /// Build a [`ManifestWriter`] for format version 1. pub fn build_v1(self) -> ManifestWriter { let metadata = ManifestMetadata::builder() @@ -112,6 +126,8 @@ impl ManifestWriterBuilder { self.key_metadata, metadata, None, + self.compression_codec, + self.compression_level, ) } @@ -131,6 +147,8 @@ impl ManifestWriterBuilder { self.key_metadata, metadata, None, + self.compression_codec, + self.compression_level, ) } @@ -150,6 +168,8 @@ impl ManifestWriterBuilder { self.key_metadata, metadata, None, + self.compression_codec, + self.compression_level, ) } @@ -171,6 +191,8 @@ impl ManifestWriterBuilder { // First row id is assigned by the [`ManifestListWriter`] when the manifest // is added to the list. None, + self.compression_codec, + self.compression_level, ) } @@ -190,6 +212,8 @@ impl ManifestWriterBuilder { self.key_metadata, metadata, None, + self.compression_codec, + self.compression_level, ) } } @@ -216,6 +240,9 @@ pub struct ManifestWriter { manifest_entries: Vec, metadata: ManifestMetadata, + + compression_codec: String, + compression_level: u8, } impl ManifestWriter { @@ -227,6 +254,8 @@ impl ManifestWriter { key_metadata: Option>, metadata: ManifestMetadata, first_row_id: Option, + compression_codec: String, + compression_level: u8, ) -> Self { Self { writer_future, @@ -243,6 +272,8 @@ impl ManifestWriter { key_metadata, manifest_entries: Vec::new(), metadata, + compression_codec, + compression_level, } } @@ -451,7 +482,14 @@ impl ManifestWriter { // Manifest schema did not change between V2 and V3 FormatVersion::V2 | FormatVersion::V3 => manifest_schema_v2(&partition_type)?, }; - let mut avro_writer = AvroWriter::new(&avro_schema, Vec::new()); + + // Determine compression codec using helper function + let codec = codec_from_str( + Some(self.compression_codec.as_str()), + self.compression_level, + ); + + let mut avro_writer = AvroWriter::with_codec(&avro_schema, Vec::new(), codec); avro_writer.add_user_metadata( "schema".to_string(), to_vec(table_schema).map_err(|err| { diff --git a/crates/iceberg/src/spec/manifest_list/writer.rs b/crates/iceberg/src/spec/manifest_list/writer.rs index f739870171..8fe19fdabd 100644 --- a/crates/iceberg/src/spec/manifest_list/writer.rs +++ b/crates/iceberg/src/spec/manifest_list/writer.rs @@ -27,6 +27,8 @@ use super::_serde::{ManifestFileV1, ManifestFileV2, ManifestFileV3}; use super::{FormatVersion, ManifestContentType, ManifestFile, UNASSIGNED_SEQUENCE_NUMBER}; use crate::error::Result; use crate::io::FileWrite; +use crate::spec::TableProperties; +use crate::spec::avro_util::codec_from_str; use crate::{Error, ErrorKind}; /// A manifest list writer. @@ -37,6 +39,8 @@ pub struct ManifestListWriter { sequence_number: i64, snapshot_id: i64, next_row_id: Option, + compression_codec: String, + compression_level: u8, } impl std::fmt::Debug for ManifestListWriter { @@ -54,6 +58,21 @@ impl ManifestListWriter { self.next_row_id } + /// Set compression codec and level for the manifest list file. + pub fn with_compression(mut self, codec: String, level: u8) -> Self { + self.compression_codec = codec.clone(); + self.compression_level = level; + + let avro_schema = match self.format_version { + FormatVersion::V1 => &MANIFEST_LIST_AVRO_SCHEMA_V1, + FormatVersion::V2 => &MANIFEST_LIST_AVRO_SCHEMA_V2, + FormatVersion::V3 => &MANIFEST_LIST_AVRO_SCHEMA_V3, + }; + let compression = codec_from_str(Some(codec.as_str()), level); + self.avro_writer = Writer::with_codec(avro_schema, Vec::new(), compression); + self + } + /// Construct a v1 [`ManifestListWriter`] that writes to a provided [`FileWrite`]. pub fn v1( writer: Box, @@ -70,7 +89,16 @@ impl ManifestListWriter { parent_snapshot_id.to_string(), ); } - Self::new(FormatVersion::V1, writer, metadata, 0, snapshot_id, None) + Self::new( + FormatVersion::V1, + writer, + metadata, + 0, + snapshot_id, + None, + TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC_DEFAULT.to_string(), + TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL_DEFAULT, + ) } /// Construct a v2 [`ManifestListWriter`] that writes to a provided [`FileWrite`]. @@ -98,6 +126,8 @@ impl ManifestListWriter { sequence_number, snapshot_id, None, + TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC_DEFAULT.to_string(), + TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL_DEFAULT, ) } @@ -133,6 +163,8 @@ impl ManifestListWriter { sequence_number, snapshot_id, first_row_id, + TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC_DEFAULT.to_string(), + TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL_DEFAULT, ) } @@ -143,13 +175,16 @@ impl ManifestListWriter { sequence_number: i64, snapshot_id: i64, first_row_id: Option, + compression_codec: String, + compression_level: u8, ) -> Self { let avro_schema = match format_version { FormatVersion::V1 => &MANIFEST_LIST_AVRO_SCHEMA_V1, FormatVersion::V2 => &MANIFEST_LIST_AVRO_SCHEMA_V2, FormatVersion::V3 => &MANIFEST_LIST_AVRO_SCHEMA_V3, }; - let mut avro_writer = Writer::new(avro_schema, Vec::new()); + let codec = codec_from_str(Some(compression_codec.as_str()), compression_level); + let mut avro_writer = Writer::with_codec(avro_schema, Vec::new(), codec); for (key, value) in metadata { avro_writer .add_user_metadata(key, value) @@ -162,6 +197,8 @@ impl ManifestListWriter { sequence_number, snapshot_id, next_row_id: first_row_id, + compression_codec, + compression_level, } } diff --git a/crates/iceberg/src/spec/mod.rs b/crates/iceberg/src/spec/mod.rs index b23ca1eda0..8822dae64f 100644 --- a/crates/iceberg/src/spec/mod.rs +++ b/crates/iceberg/src/spec/mod.rs @@ -17,6 +17,7 @@ //! Spec for Iceberg. +mod avro_util; mod datatypes; mod encrypted_key; mod manifest; diff --git a/crates/iceberg/src/spec/table_properties.rs b/crates/iceberg/src/spec/table_properties.rs index 444c9ac4ed..8ae668ccbc 100644 --- a/crates/iceberg/src/spec/table_properties.rs +++ b/crates/iceberg/src/spec/table_properties.rs @@ -114,6 +114,10 @@ pub struct TableProperties { pub write_target_file_size_bytes: usize, /// Compression codec for metadata files (JSON) pub metadata_compression_codec: CompressionCodec, + /// Compression codec for Avro files (manifests, manifest lists) + pub avro_compression_codec: String, + /// Compression level for Avro files + pub avro_compression_level: u8, /// Whether to use `FanoutWriter` for partitioned tables. pub write_datafusion_fanout_enabled: bool, /// Whether garbage collection is enabled on drop. @@ -232,6 +236,17 @@ impl TableProperties { pub const PROPERTY_METADATA_COMPRESSION_CODEC: &str = "write.metadata.compression-codec"; /// Default metadata compression codec - uncompressed pub const PROPERTY_METADATA_COMPRESSION_CODEC_DEFAULT: &str = "none"; + + /// Compression codec for Avro files (manifests, manifest lists) + pub const PROPERTY_AVRO_COMPRESSION_CODEC: &str = "write.avro.compression-codec"; + /// Default Avro compression codec - gzip + pub const PROPERTY_AVRO_COMPRESSION_CODEC_DEFAULT: &str = "gzip"; + + /// Compression level for Avro files + pub const PROPERTY_AVRO_COMPRESSION_LEVEL: &str = "write.avro.compression-level"; + /// Default Avro compression level (9 = BestCompression) + pub const PROPERTY_AVRO_COMPRESSION_LEVEL_DEFAULT: u8 = 9; + /// Whether to use `FanoutWriter` for partitioned tables (handles unsorted data). /// If false, uses `ClusteredWriter` (requires sorted data, more memory efficient). pub const PROPERTY_DATAFUSION_WRITE_FANOUT_ENABLED: &str = "write.datafusion.fanout.enabled"; @@ -325,6 +340,16 @@ impl TryFrom<&HashMap> for TableProperties { TableProperties::PROPERTY_WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT, )?, metadata_compression_codec: parse_metadata_file_compression(props)?, + avro_compression_codec: parse_property( + props, + TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC, + TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC_DEFAULT.to_string(), + )?, + avro_compression_level: parse_property( + props, + TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL, + TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL_DEFAULT, + )?, write_datafusion_fanout_enabled: parse_property( props, TableProperties::PROPERTY_DATAFUSION_WRITE_FANOUT_ENABLED, @@ -416,6 +441,14 @@ mod tests { table_properties.metadata_compression_codec, CompressionCodec::None ); + assert_eq!( + table_properties.avro_compression_codec, + TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC_DEFAULT.to_string() + ); + assert_eq!( + table_properties.avro_compression_level, + TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL_DEFAULT + ); assert_eq!( table_properties.gc_enabled, TableProperties::PROPERTY_GC_ENABLED_DEFAULT @@ -458,15 +491,27 @@ mod tests { #[test] fn test_table_properties_compression() { - let props = HashMap::from([( - TableProperties::PROPERTY_METADATA_COMPRESSION_CODEC.to_string(), - "gzip".to_string(), - )]); + let props = HashMap::from([ + ( + TableProperties::PROPERTY_METADATA_COMPRESSION_CODEC.to_string(), + "gzip".to_string(), + ), + ( + TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC.to_string(), + "zstd".to_string(), + ), + ( + TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL.to_string(), + "3".to_string(), + ), + ]); let table_properties = TableProperties::try_from(&props).unwrap(); assert_eq!( table_properties.metadata_compression_codec, CompressionCodec::gzip_default() ); + assert_eq!(table_properties.avro_compression_codec, "zstd"); + assert_eq!(table_properties.avro_compression_level, 3); } #[test] diff --git a/crates/iceberg/src/transaction/snapshot.rs b/crates/iceberg/src/transaction/snapshot.rs index 8e47226072..23b035d88c 100644 --- a/crates/iceberg/src/transaction/snapshot.rs +++ b/crates/iceberg/src/transaction/snapshot.rs @@ -252,6 +252,19 @@ impl<'a> SnapshotProducer<'a> { DataFileFormat::Avro ); let output_file = self.table.file_io().new_output(new_manifest_path)?; + + // Get compression settings from table properties + let table_props = + TableProperties::try_from(self.table.metadata().properties()).map_err(|e| { + Error::new( + ErrorKind::DataInvalid, + "Failed to parse table properties for compression settings", + ) + .with_source(e) + })?; + let codec = table_props.avro_compression_codec.clone(); + let level = table_props.avro_compression_level; + let builder = ManifestWriterBuilder::new( output_file, Some(self.snapshot_id), @@ -262,7 +275,9 @@ impl<'a> SnapshotProducer<'a> { .default_partition_spec() .as_ref() .clone(), - ); + ) + .with_compression(codec, level); + match self.table.metadata().format_version() { FormatVersion::V1 => Ok(builder.build_v1()), FormatVersion::V2 => match content { @@ -446,6 +461,17 @@ impl<'a> SnapshotProducer<'a> { .new_output(manifest_list_path.clone())? .writer() .await?; + let table_props = + TableProperties::try_from(self.table.metadata().properties()).map_err(|e| { + Error::new( + ErrorKind::DataInvalid, + "Failed to parse table properties for compression settings", + ) + .with_source(e) + })?; + let codec = table_props.avro_compression_codec.clone(); + let level = table_props.avro_compression_level; + let mut manifest_list_writer = match self.table.metadata().format_version() { FormatVersion::V1 => ManifestListWriter::v1( writer, @@ -465,7 +491,8 @@ impl<'a> SnapshotProducer<'a> { next_seq_num, Some(first_row_id), ), - }; + } + .with_compression(codec, level); // Calling self.summary() before self.manifest_file() is important because self.added_data_files // will be set to an empty vec after self.manifest_file() returns, resulting in an empty summary From 7e0277495290b4d33c292ffbae4d486d758436be Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Thu, 13 Nov 2025 06:54:57 +0000 Subject: [PATCH 02/33] fix clippy --- crates/iceberg/src/spec/avro_util.rs | 32 ++++++++++++ crates/iceberg/src/spec/manifest/writer.rs | 51 +++++++------------ .../iceberg/src/spec/manifest_list/writer.rs | 35 +++++-------- crates/iceberg/src/spec/mod.rs | 1 + crates/iceberg/src/transaction/snapshot.rs | 24 +++++---- 5 files changed, 78 insertions(+), 65 deletions(-) diff --git a/crates/iceberg/src/spec/avro_util.rs b/crates/iceberg/src/spec/avro_util.rs index 36d702c832..2c4ac6e98d 100644 --- a/crates/iceberg/src/spec/avro_util.rs +++ b/crates/iceberg/src/spec/avro_util.rs @@ -20,6 +20,38 @@ use apache_avro::Codec; use log::warn; +use crate::spec::TableProperties; + +/// Settings for compression codec and level. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct CompressionSettings { + /// The compression codec name (e.g., "gzip", "zstd", "deflate", "none") + pub codec: String, + /// The compression level + pub level: u8, +} + +impl CompressionSettings { + /// Create a new CompressionSettings with the specified codec and level. + pub fn new(codec: String, level: u8) -> Self { + Self { codec, level } + } + + /// Convert to apache_avro::Codec using the codec_from_str helper function. + pub(crate) fn to_codec(&self) -> Codec { + codec_from_str(Some(&self.codec), self.level) + } +} + +impl Default for CompressionSettings { + fn default() -> Self { + Self { + codec: TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC_DEFAULT.to_string(), + level: TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL_DEFAULT, + } + } +} + /// Convert codec name and level to apache_avro::Codec. /// Returns Codec::Null for unknown or unsupported codecs. /// diff --git a/crates/iceberg/src/spec/manifest/writer.rs b/crates/iceberg/src/spec/manifest/writer.rs index f3e92b30f8..656e1985d1 100644 --- a/crates/iceberg/src/spec/manifest/writer.rs +++ b/crates/iceberg/src/spec/manifest/writer.rs @@ -31,12 +31,12 @@ use super::{ use crate::encryption::EncryptedOutputFile; use crate::error::Result; use crate::io::{FileWrite, OutputFile}; -use crate::spec::avro_util::codec_from_str; +use crate::spec::avro_util::CompressionSettings; use crate::spec::manifest::_serde::{ManifestEntryV1, ManifestEntryV2}; use crate::spec::manifest::{manifest_schema_v1, manifest_schema_v2}; use crate::spec::{ DataContentType, DataFile, FieldSummary, ManifestEntry, ManifestFile, ManifestMetadata, - ManifestStatus, PrimitiveLiteral, SchemaRef, StructType, TableProperties, + ManifestStatus, PrimitiveLiteral, SchemaRef, StructType, }; use crate::{Error, ErrorKind}; @@ -54,8 +54,7 @@ pub struct ManifestWriterBuilder { key_metadata: Option>, schema: SchemaRef, partition_spec: PartitionSpec, - compression_codec: String, - compression_level: u8, + compression: CompressionSettings, } impl ManifestWriterBuilder { @@ -75,8 +74,7 @@ impl ManifestWriterBuilder { key_metadata, schema, partition_spec, - compression_codec: TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC_DEFAULT.to_string(), - compression_level: TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL_DEFAULT, + compression: CompressionSettings::default(), } } @@ -98,15 +96,13 @@ impl ManifestWriterBuilder { key_metadata, schema, partition_spec, - compression_codec: TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC_DEFAULT.to_string(), - compression_level: TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL_DEFAULT, + compression: CompressionSettings::default(), } } - /// Set compression codec and level for the manifest file. - pub fn with_compression(mut self, codec: String, level: u8) -> Self { - self.compression_codec = codec; - self.compression_level = level; + /// Set compression settings for the manifest file. + pub fn with_compression(mut self, compression: CompressionSettings) -> Self { + self.compression = compression; self } @@ -126,8 +122,7 @@ impl ManifestWriterBuilder { self.key_metadata, metadata, None, - self.compression_codec, - self.compression_level, + self.compression, ) } @@ -147,8 +142,7 @@ impl ManifestWriterBuilder { self.key_metadata, metadata, None, - self.compression_codec, - self.compression_level, + self.compression, ) } @@ -168,8 +162,7 @@ impl ManifestWriterBuilder { self.key_metadata, metadata, None, - self.compression_codec, - self.compression_level, + self.compression, ) } @@ -191,8 +184,7 @@ impl ManifestWriterBuilder { // First row id is assigned by the [`ManifestListWriter`] when the manifest // is added to the list. None, - self.compression_codec, - self.compression_level, + self.compression, ) } @@ -212,8 +204,7 @@ impl ManifestWriterBuilder { self.key_metadata, metadata, None, - self.compression_codec, - self.compression_level, + self.compression, ) } } @@ -241,8 +232,7 @@ pub struct ManifestWriter { metadata: ManifestMetadata, - compression_codec: String, - compression_level: u8, + compression: CompressionSettings, } impl ManifestWriter { @@ -254,8 +244,7 @@ impl ManifestWriter { key_metadata: Option>, metadata: ManifestMetadata, first_row_id: Option, - compression_codec: String, - compression_level: u8, + compression: CompressionSettings, ) -> Self { Self { writer_future, @@ -272,8 +261,7 @@ impl ManifestWriter { key_metadata, manifest_entries: Vec::new(), metadata, - compression_codec, - compression_level, + compression, } } @@ -483,11 +471,8 @@ impl ManifestWriter { FormatVersion::V2 | FormatVersion::V3 => manifest_schema_v2(&partition_type)?, }; - // Determine compression codec using helper function - let codec = codec_from_str( - Some(self.compression_codec.as_str()), - self.compression_level, - ); + // Determine compression codec using CompressionSettings + let codec = self.compression.to_codec(); let mut avro_writer = AvroWriter::with_codec(&avro_schema, Vec::new(), codec); avro_writer.add_user_metadata( diff --git a/crates/iceberg/src/spec/manifest_list/writer.rs b/crates/iceberg/src/spec/manifest_list/writer.rs index 8fe19fdabd..e1c6efa3f6 100644 --- a/crates/iceberg/src/spec/manifest_list/writer.rs +++ b/crates/iceberg/src/spec/manifest_list/writer.rs @@ -27,8 +27,7 @@ use super::_serde::{ManifestFileV1, ManifestFileV2, ManifestFileV3}; use super::{FormatVersion, ManifestContentType, ManifestFile, UNASSIGNED_SEQUENCE_NUMBER}; use crate::error::Result; use crate::io::FileWrite; -use crate::spec::TableProperties; -use crate::spec::avro_util::codec_from_str; +use crate::spec::avro_util::CompressionSettings; use crate::{Error, ErrorKind}; /// A manifest list writer. @@ -39,8 +38,7 @@ pub struct ManifestListWriter { sequence_number: i64, snapshot_id: i64, next_row_id: Option, - compression_codec: String, - compression_level: u8, + compression: CompressionSettings, } impl std::fmt::Debug for ManifestListWriter { @@ -58,18 +56,16 @@ impl ManifestListWriter { self.next_row_id } - /// Set compression codec and level for the manifest list file. - pub fn with_compression(mut self, codec: String, level: u8) -> Self { - self.compression_codec = codec.clone(); - self.compression_level = level; - + /// Set compression settings for the manifest list file. + pub fn with_compression(mut self, compression: CompressionSettings) -> Self { let avro_schema = match self.format_version { FormatVersion::V1 => &MANIFEST_LIST_AVRO_SCHEMA_V1, FormatVersion::V2 => &MANIFEST_LIST_AVRO_SCHEMA_V2, FormatVersion::V3 => &MANIFEST_LIST_AVRO_SCHEMA_V3, }; - let compression = codec_from_str(Some(codec.as_str()), level); - self.avro_writer = Writer::with_codec(avro_schema, Vec::new(), compression); + let codec = compression.to_codec(); + self.avro_writer = Writer::with_codec(avro_schema, Vec::new(), codec); + self.compression = compression; self } @@ -96,8 +92,7 @@ impl ManifestListWriter { 0, snapshot_id, None, - TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC_DEFAULT.to_string(), - TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL_DEFAULT, + CompressionSettings::default(), ) } @@ -126,8 +121,7 @@ impl ManifestListWriter { sequence_number, snapshot_id, None, - TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC_DEFAULT.to_string(), - TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL_DEFAULT, + CompressionSettings::default(), ) } @@ -163,8 +157,7 @@ impl ManifestListWriter { sequence_number, snapshot_id, first_row_id, - TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC_DEFAULT.to_string(), - TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL_DEFAULT, + CompressionSettings::default(), ) } @@ -175,15 +168,14 @@ impl ManifestListWriter { sequence_number: i64, snapshot_id: i64, first_row_id: Option, - compression_codec: String, - compression_level: u8, + compression: CompressionSettings, ) -> Self { let avro_schema = match format_version { FormatVersion::V1 => &MANIFEST_LIST_AVRO_SCHEMA_V1, FormatVersion::V2 => &MANIFEST_LIST_AVRO_SCHEMA_V2, FormatVersion::V3 => &MANIFEST_LIST_AVRO_SCHEMA_V3, }; - let codec = codec_from_str(Some(compression_codec.as_str()), compression_level); + let codec = compression.to_codec(); let mut avro_writer = Writer::with_codec(avro_schema, Vec::new(), codec); for (key, value) in metadata { avro_writer @@ -197,8 +189,7 @@ impl ManifestListWriter { sequence_number, snapshot_id, next_row_id: first_row_id, - compression_codec, - compression_level, + compression, } } diff --git a/crates/iceberg/src/spec/mod.rs b/crates/iceberg/src/spec/mod.rs index 8822dae64f..60ad8bef76 100644 --- a/crates/iceberg/src/spec/mod.rs +++ b/crates/iceberg/src/spec/mod.rs @@ -38,6 +38,7 @@ mod view_metadata; mod view_metadata_builder; mod view_version; +pub use avro_util::CompressionSettings; pub use datatypes::*; pub use encrypted_key::*; pub use manifest::*; diff --git a/crates/iceberg/src/transaction/snapshot.rs b/crates/iceberg/src/transaction/snapshot.rs index 23b035d88c..3fa7fc4f06 100644 --- a/crates/iceberg/src/transaction/snapshot.rs +++ b/crates/iceberg/src/transaction/snapshot.rs @@ -25,10 +25,10 @@ use uuid::Uuid; use crate::error::Result; use crate::spec::{ - DataFile, DataFileFormat, FormatVersion, MAIN_BRANCH, ManifestContentType, ManifestEntry, - ManifestFile, ManifestListWriter, ManifestWriter, ManifestWriterBuilder, Operation, Snapshot, - SnapshotReference, SnapshotRetention, SnapshotSummaryCollector, Struct, StructType, Summary, - TableProperties, update_snapshot_summaries, + CompressionSettings, DataFile, DataFileFormat, FormatVersion, MAIN_BRANCH, ManifestContentType, + ManifestEntry, ManifestFile, ManifestListWriter, ManifestWriter, ManifestWriterBuilder, + Operation, Snapshot, SnapshotReference, SnapshotRetention, SnapshotSummaryCollector, Struct, + StructType, Summary, TableProperties, update_snapshot_summaries, }; use crate::table::Table; use crate::transaction::ActionCommit; @@ -262,8 +262,10 @@ impl<'a> SnapshotProducer<'a> { ) .with_source(e) })?; - let codec = table_props.avro_compression_codec.clone(); - let level = table_props.avro_compression_level; + let compression = CompressionSettings::new( + table_props.avro_compression_codec.clone(), + table_props.avro_compression_level, + ); let builder = ManifestWriterBuilder::new( output_file, @@ -276,7 +278,7 @@ impl<'a> SnapshotProducer<'a> { .as_ref() .clone(), ) - .with_compression(codec, level); + .with_compression(compression); match self.table.metadata().format_version() { FormatVersion::V1 => Ok(builder.build_v1()), @@ -469,8 +471,10 @@ impl<'a> SnapshotProducer<'a> { ) .with_source(e) })?; - let codec = table_props.avro_compression_codec.clone(); - let level = table_props.avro_compression_level; + let compression = CompressionSettings::new( + table_props.avro_compression_codec.clone(), + table_props.avro_compression_level, + ); let mut manifest_list_writer = match self.table.metadata().format_version() { FormatVersion::V1 => ManifestListWriter::v1( @@ -492,7 +496,7 @@ impl<'a> SnapshotProducer<'a> { Some(first_row_id), ), } - .with_compression(codec, level); + .with_compression(compression); // Calling self.summary() before self.manifest_file() is important because self.added_data_files // will be set to an empty vec after self.manifest_file() returns, resulting in an empty summary From 83973083559fd925bc96f313dee5e1d9a8c651e9 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Thu, 13 Nov 2025 07:22:03 +0000 Subject: [PATCH 03/33] clippy again --- crates/iceberg/src/spec/avro_util.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/iceberg/src/spec/avro_util.rs b/crates/iceberg/src/spec/avro_util.rs index 2c4ac6e98d..9de237be0d 100644 --- a/crates/iceberg/src/spec/avro_util.rs +++ b/crates/iceberg/src/spec/avro_util.rs @@ -213,7 +213,7 @@ mod tests { writer.append(record).unwrap(); let encoded = writer.into_inner().unwrap(); - assert!(encoded.len() > 0, "Compression should produce output"); + assert!(!encoded.is_empty(), "Compression should produce output"); } // Test clamping - higher than 22 should be clamped to 22 From 487a7fd2ebb6bfc90d8ea53731e48da747df9c91 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Sun, 16 Nov 2025 00:27:42 +0000 Subject: [PATCH 04/33] wip --- crates/iceberg/src/spec/avro_util.rs | 84 +++++++++++++-------- crates/iceberg/src/spec/manifest/writer.rs | 9 +-- crates/iceberg/src/spec/table_properties.rs | 57 +++++++++++--- crates/iceberg/src/transaction/snapshot.rs | 4 +- 4 files changed, 105 insertions(+), 49 deletions(-) diff --git a/crates/iceberg/src/spec/avro_util.rs b/crates/iceberg/src/spec/avro_util.rs index 9de237be0d..7371a46c85 100644 --- a/crates/iceberg/src/spec/avro_util.rs +++ b/crates/iceberg/src/spec/avro_util.rs @@ -25,15 +25,15 @@ use crate::spec::TableProperties; /// Settings for compression codec and level. #[derive(Debug, Clone, PartialEq, Eq)] pub struct CompressionSettings { - /// The compression codec name (e.g., "gzip", "zstd", "deflate", "none") + /// The compression codec name (e.g., "gzip", "zstd", "deflate", "uncompressed") pub codec: String, - /// The compression level - pub level: u8, + /// The compression level (None uses codec-specific defaults: gzip=9, zstd=1) + pub level: Option, } impl CompressionSettings { /// Create a new CompressionSettings with the specified codec and level. - pub fn new(codec: String, level: u8) -> Self { + pub fn new(codec: String, level: Option) -> Self { Self { codec, level } } @@ -47,7 +47,7 @@ impl Default for CompressionSettings { fn default() -> Self { Self { codec: TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC_DEFAULT.to_string(), - level: TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL_DEFAULT, + level: None, } } } @@ -57,8 +57,8 @@ impl Default for CompressionSettings { /// /// # Arguments /// -/// * `codec` - The name of the compression codec (e.g., "gzip", "zstd", "deflate", "none") -/// * `level` - The compression level. For deflate/gzip: +/// * `codec` - The name of the compression codec (e.g., "gzip", "zstd", "deflate", "uncompressed") +/// * `level` - The compression level (None uses codec defaults: gzip=9, zstd=1). For deflate/gzip: /// - 0: NoCompression /// - 1: BestSpeed /// - 9: BestCompression @@ -67,9 +67,9 @@ impl Default for CompressionSettings { /// /// # Supported Codecs /// -/// - `gzip` or `deflate`: Uses Deflate compression with specified level -/// - `zstd`: Uses Zstandard compression (level clamped to valid zstd range) -/// - `none` or `None`: No compression +/// - `gzip` or `deflate`: Uses Deflate compression with specified level (default: 9) +/// - `zstd`: Uses Zstandard compression (default: 1, level clamped to valid zstd range 0-22) +/// - `uncompressed` or `None`: No compression /// - Any other value: Defaults to no compression (Codec::Null) /// /// # Compression Levels @@ -80,16 +80,17 @@ impl Default for CompressionSettings { /// - Level 9: Best compression (slower, better compression) /// - Level 10: Uber compression (slowest, best compression) /// - Other: Default level (balanced speed/compression) -pub(crate) fn codec_from_str(codec: Option<&str>, level: u8) -> Codec { +pub(crate) fn codec_from_str(codec: Option<&str>, level: Option) -> Codec { use apache_avro::{DeflateSettings, ZstandardSettings}; match codec { Some("gzip") | Some("deflate") => { // Map compression level to miniz_oxide::deflate::CompressionLevel // Reference: https://docs.rs/miniz_oxide/latest/miniz_oxide/deflate/enum.CompressionLevel.html + // Default level for gzip/deflate is 9 (BestCompression) to match Java use miniz_oxide::deflate::CompressionLevel; - let compression_level = match level { + let compression_level = match level.unwrap_or(9) { 0 => CompressionLevel::NoCompression, 1 => CompressionLevel::BestSpeed, 9 => CompressionLevel::BestCompression, @@ -101,10 +102,11 @@ pub(crate) fn codec_from_str(codec: Option<&str>, level: u8) -> Codec { } Some("zstd") => { // Zstandard supports levels 0-22, clamp to valid range - let zstd_level = level.min(22); + // Default level for zstd is 1 to match Java + let zstd_level = level.unwrap_or(1).min(22); Codec::Zstandard(ZstandardSettings::new(zstd_level)) } - Some("none") | None => Codec::Null, + Some("uncompressed") | None => Codec::Null, Some(unknown) => { warn!( "Unrecognized compression codec '{}', using no compression (Codec::Null)", @@ -123,43 +125,65 @@ mod tests { use apache_avro::{Schema, Writer}; use super::*; + use apache_avro::{DeflateSettings, ZstandardSettings}; + use miniz_oxide::deflate::CompressionLevel; #[test] fn test_codec_from_str_gzip() { - let codec = codec_from_str(Some("gzip"), 5); - assert!(matches!(codec, Codec::Deflate(_))); + let codec = codec_from_str(Some("gzip"), Some(5)); + assert_eq!( + codec, + Codec::Deflate(DeflateSettings::new(CompressionLevel::DefaultLevel)) + ); } #[test] fn test_codec_from_str_deflate() { - let codec = codec_from_str(Some("deflate"), 9); - assert!(matches!(codec, Codec::Deflate(_))); + let codec = codec_from_str(Some("deflate"), Some(9)); + assert_eq!( + codec, + Codec::Deflate(DeflateSettings::new(CompressionLevel::BestCompression)) + ); } #[test] fn test_codec_from_str_zstd() { - let codec = codec_from_str(Some("zstd"), 3); - assert!(matches!(codec, Codec::Zstandard(_))); + let codec = codec_from_str(Some("zstd"), Some(3)); + assert_eq!(codec, Codec::Zstandard(ZstandardSettings::new(3))); } #[test] - fn test_codec_from_str_none() { - let codec = codec_from_str(Some("none"), 0); + fn test_codec_from_str_uncompressed() { + let codec = codec_from_str(Some("uncompressed"), None); assert!(matches!(codec, Codec::Null)); } #[test] fn test_codec_from_str_null() { - let codec = codec_from_str(None, 0); + let codec = codec_from_str(None, None); assert!(matches!(codec, Codec::Null)); } #[test] fn test_codec_from_str_unknown() { - let codec = codec_from_str(Some("unknown"), 1); + let codec = codec_from_str(Some("unknown"), Some(1)); assert!(matches!(codec, Codec::Null)); } + #[test] + fn test_codec_from_str_gzip_default_level() { + // Test that None level defaults to 9 for gzip + let codec = codec_from_str(Some("gzip"), None); + assert_eq!(codec, Codec::Deflate(DeflateSettings::new(CompressionLevel::BestCompression))); + } + + #[test] + fn test_codec_from_str_zstd_default_level() { + // Test that None level defaults to 1 for zstd + let codec = codec_from_str(Some("zstd"), None); + assert_eq!(codec, Codec::Zstandard(ZstandardSettings::new(1))); + } + #[test] fn test_codec_from_str_deflate_levels() { // Create a simple schema for testing @@ -171,7 +195,7 @@ mod tests { // Test that different compression levels produce different output sizes let mut sizes = HashMap::new(); for level in [0, 1, 5, 9, 10] { - let codec = codec_from_str(Some("gzip"), level); + let codec = codec_from_str(Some("gzip"), Some(level)); let mut writer = Writer::with_codec(&schema, Vec::new(), codec); let mut record = Record::new(&schema).unwrap(); @@ -203,7 +227,7 @@ mod tests { // Test various levels by checking they produce valid codecs for level in [0, 3, 15, 22] { - let codec = codec_from_str(Some("zstd"), level); + let codec = codec_from_str(Some("zstd"), Some(level)); assert!(matches!(codec, Codec::Zstandard(_))); // Verify the codec actually works by compressing data @@ -217,8 +241,8 @@ mod tests { } // Test clamping - higher than 22 should be clamped to 22 - let codec_100 = codec_from_str(Some("zstd"), 100); - let codec_22 = codec_from_str(Some("zstd"), 22); + let codec_100 = codec_from_str(Some("zstd"), Some(100)); + let codec_22 = codec_from_str(Some("zstd"), Some(22)); // Both should work and produce similar results let mut writer_100 = Writer::with_codec(&schema, Vec::new(), codec_100); @@ -250,14 +274,14 @@ mod tests { let test_str = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; // Test gzip level 0 (no compression) vs level 9 (best compression) - let codec_0 = codec_from_str(Some("gzip"), 0); + let codec_0 = codec_from_str(Some("gzip"), Some(0)); let mut writer_0 = Writer::with_codec(&schema, Vec::new(), codec_0); let mut record_0 = Record::new(&schema).unwrap(); record_0.put("field", test_str); writer_0.append(record_0).unwrap(); let size_0 = writer_0.into_inner().unwrap().len(); - let codec_9 = codec_from_str(Some("gzip"), 9); + let codec_9 = codec_from_str(Some("gzip"), Some(9)); let mut writer_9 = Writer::with_codec(&schema, Vec::new(), codec_9); let mut record_9 = Record::new(&schema).unwrap(); record_9.put("field", test_str); diff --git a/crates/iceberg/src/spec/manifest/writer.rs b/crates/iceberg/src/spec/manifest/writer.rs index 656e1985d1..33d3c37ddc 100644 --- a/crates/iceberg/src/spec/manifest/writer.rs +++ b/crates/iceberg/src/spec/manifest/writer.rs @@ -65,6 +65,7 @@ impl ManifestWriterBuilder { key_metadata: Option>, schema: SchemaRef, partition_spec: PartitionSpec, + compression: CompressionSettings, ) -> Self { let location = output.location().to_owned(); Self { @@ -74,7 +75,7 @@ impl ManifestWriterBuilder { key_metadata, schema, partition_spec, - compression: CompressionSettings::default(), + compression, } } @@ -100,12 +101,6 @@ impl ManifestWriterBuilder { } } - /// Set compression settings for the manifest file. - pub fn with_compression(mut self, compression: CompressionSettings) -> Self { - self.compression = compression; - self - } - /// Build a [`ManifestWriter`] for format version 1. pub fn build_v1(self) -> ManifestWriter { let metadata = ManifestMetadata::builder() diff --git a/crates/iceberg/src/spec/table_properties.rs b/crates/iceberg/src/spec/table_properties.rs index 8ae668ccbc..57c32438e6 100644 --- a/crates/iceberg/src/spec/table_properties.rs +++ b/crates/iceberg/src/spec/table_properties.rs @@ -116,8 +116,8 @@ pub struct TableProperties { pub metadata_compression_codec: CompressionCodec, /// Compression codec for Avro files (manifests, manifest lists) pub avro_compression_codec: String, - /// Compression level for Avro files - pub avro_compression_level: u8, + /// Compression level for Avro files (None uses codec-specific defaults: gzip=9, zstd=1) + pub avro_compression_level: Option, /// Whether to use `FanoutWriter` for partitioned tables. pub write_datafusion_fanout_enabled: bool, /// Whether garbage collection is enabled on drop. @@ -244,8 +244,8 @@ impl TableProperties { /// Compression level for Avro files pub const PROPERTY_AVRO_COMPRESSION_LEVEL: &str = "write.avro.compression-level"; - /// Default Avro compression level (9 = BestCompression) - pub const PROPERTY_AVRO_COMPRESSION_LEVEL_DEFAULT: u8 = 9; + /// Default Avro compression level (None, uses codec-specific defaults: gzip=9, zstd=1) + pub const PROPERTY_AVRO_COMPRESSION_LEVEL_DEFAULT: Option = None; /// Whether to use `FanoutWriter` for partitioned tables (handles unsorted data). /// If false, uses `ClusteredWriter` (requires sorted data, more memory efficient). @@ -345,11 +345,20 @@ impl TryFrom<&HashMap> for TableProperties { TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC, TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC_DEFAULT.to_string(), )?, - avro_compression_level: parse_property( - props, - TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL, - TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL_DEFAULT, - )?, + avro_compression_level: props + .get(TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL) + .map(|v| { + v.parse::().map_err(|e| { + Error::new( + ErrorKind::DataInvalid, + format!( + "Invalid value for {}: {e}", + TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL + ), + ) + }) + }) + .transpose()?, write_datafusion_fanout_enabled: parse_property( props, TableProperties::PROPERTY_DATAFUSION_WRITE_FANOUT_ENABLED, @@ -511,7 +520,7 @@ mod tests { CompressionCodec::gzip_default() ); assert_eq!(table_properties.avro_compression_codec, "zstd"); - assert_eq!(table_properties.avro_compression_level, 3); + assert_eq!(table_properties.avro_compression_level, Some(3)); } #[test] @@ -903,4 +912,32 @@ mod tests { let tp = TableProperties::try_from(&props).unwrap(); assert!(!tp.cdc_enabled); } + + #[test] + fn test_table_properties_optional_compression_level() { + // Test that compression level is None when not specified + let props = HashMap::new(); + let table_properties = TableProperties::try_from(&props).unwrap(); + assert_eq!(table_properties.avro_compression_level, None); + + // Test that compression level is Some(value) when specified + let props = HashMap::from([( + TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL.to_string(), + "5".to_string(), + )]); + let table_properties = TableProperties::try_from(&props).unwrap(); + assert_eq!(table_properties.avro_compression_level, Some(5)); + + // Test that invalid compression level returns error + let props = HashMap::from([( + TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL.to_string(), + "invalid".to_string(), + )]); + let result = TableProperties::try_from(&props); + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("Invalid value for write.avro.compression-level")); + } } diff --git a/crates/iceberg/src/transaction/snapshot.rs b/crates/iceberg/src/transaction/snapshot.rs index 3fa7fc4f06..ebd51848ea 100644 --- a/crates/iceberg/src/transaction/snapshot.rs +++ b/crates/iceberg/src/transaction/snapshot.rs @@ -277,8 +277,8 @@ impl<'a> SnapshotProducer<'a> { .default_partition_spec() .as_ref() .clone(), - ) - .with_compression(compression); + compression, + ); match self.table.metadata().format_version() { FormatVersion::V1 => Ok(builder.build_v1()), From 613a67dfa3acdbfb896f3f127a870cdce2cca9bd Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Sun, 16 Nov 2025 00:41:10 +0000 Subject: [PATCH 05/33] address comments --- crates/iceberg/src/io/object_cache.rs | 7 +- crates/iceberg/src/scan/mod.rs | 11 ++- crates/iceberg/src/spec/avro_util.rs | 10 +- crates/iceberg/src/spec/manifest/mod.rs | 8 +- crates/iceberg/src/spec/manifest/writer.rs | 23 +---- crates/iceberg/src/spec/manifest_list/mod.rs | 26 ++---- .../iceberg/src/spec/manifest_list/writer.rs | 92 ++++++++----------- crates/iceberg/src/spec/table_properties.rs | 10 +- crates/iceberg/src/transaction/snapshot.rs | 18 ++-- 9 files changed, 88 insertions(+), 117 deletions(-) diff --git a/crates/iceberg/src/io/object_cache.rs b/crates/iceberg/src/io/object_cache.rs index d724c6ecda..6f129d23b1 100644 --- a/crates/iceberg/src/io/object_cache.rs +++ b/crates/iceberg/src/io/object_cache.rs @@ -216,8 +216,9 @@ mod tests { use crate::TableIdent; use crate::io::{FileIO, OutputFile}; use crate::spec::{ - DataContentType, DataFileBuilder, DataFileFormat, Literal, ManifestEntry, - ManifestListWriter, ManifestStatus, ManifestWriterBuilder, Struct, TableMetadata, + CompressionSettings, DataContentType, DataFileBuilder, DataFileFormat, Literal, + ManifestEntry, ManifestListWriter, ManifestStatus, ManifestWriterBuilder, Struct, + TableMetadata, }; use crate::table::Table; use crate::test_utils::test_runtime; @@ -296,6 +297,7 @@ mod tests { None, current_schema.clone(), current_partition_spec.as_ref().clone(), + CompressionSettings::default(), ) .build_v2_data(); writer @@ -333,6 +335,7 @@ mod tests { current_snapshot.snapshot_id(), current_snapshot.parent_snapshot_id(), current_snapshot.sequence_number(), + CompressionSettings::default(), ); manifest_list_write .add_manifests(vec![data_file_manifest].into_iter()) diff --git a/crates/iceberg/src/scan/mod.rs b/crates/iceberg/src/scan/mod.rs index 86092313c6..2a373b38e5 100644 --- a/crates/iceberg/src/scan/mod.rs +++ b/crates/iceberg/src/scan/mod.rs @@ -644,9 +644,10 @@ pub mod tests { use crate::metadata_columns::RESERVED_COL_NAME_FILE; use crate::scan::FileScanTask; use crate::spec::{ - DEFAULT_SCHEMA_NAME_MAPPING, DataContentType, DataFileBuilder, DataFileFormat, Datum, - Literal, ManifestEntry, ManifestListWriter, ManifestStatus, ManifestWriterBuilder, - NestedField, PartitionSpec, PrimitiveType, Schema, Struct, StructType, TableMetadata, Type, + CompressionSettings, DEFAULT_SCHEMA_NAME_MAPPING, DataContentType, DataFileBuilder, + DataFileFormat, Datum, Literal, ManifestEntry, ManifestListWriter, ManifestStatus, + ManifestWriterBuilder, NestedField, PartitionSpec, PrimitiveType, Schema, Struct, + StructType, TableMetadata, Type, }; use crate::table::Table; use crate::test_utils::test_runtime; @@ -848,6 +849,7 @@ pub mod tests { None, current_schema.clone(), current_partition_spec.as_ref().clone(), + CompressionSettings::default(), ) .build_v2_data(); writer @@ -930,6 +932,7 @@ pub mod tests { current_snapshot.snapshot_id(), current_snapshot.parent_snapshot_id(), current_snapshot.sequence_number(), + CompressionSettings::default(), ); manifest_list_write .add_manifests(vec![data_file_manifest].into_iter()) @@ -1077,6 +1080,7 @@ pub mod tests { None, current_schema.clone(), current_partition_spec.as_ref().clone(), + CompressionSettings::default(), ) .build_v2_data(); @@ -1166,6 +1170,7 @@ pub mod tests { current_snapshot.snapshot_id(), current_snapshot.parent_snapshot_id(), current_snapshot.sequence_number(), + CompressionSettings::default(), ); manifest_list_write .add_manifests(vec![data_file_manifest].into_iter()) diff --git a/crates/iceberg/src/spec/avro_util.rs b/crates/iceberg/src/spec/avro_util.rs index 7371a46c85..77baf32b7f 100644 --- a/crates/iceberg/src/spec/avro_util.rs +++ b/crates/iceberg/src/spec/avro_util.rs @@ -122,11 +122,10 @@ mod tests { use std::collections::HashMap; use apache_avro::types::Record; - use apache_avro::{Schema, Writer}; + use apache_avro::{DeflateSettings, Schema, Writer, ZstandardSettings}; + use miniz_oxide::deflate::CompressionLevel; use super::*; - use apache_avro::{DeflateSettings, ZstandardSettings}; - use miniz_oxide::deflate::CompressionLevel; #[test] fn test_codec_from_str_gzip() { @@ -174,7 +173,10 @@ mod tests { fn test_codec_from_str_gzip_default_level() { // Test that None level defaults to 9 for gzip let codec = codec_from_str(Some("gzip"), None); - assert_eq!(codec, Codec::Deflate(DeflateSettings::new(CompressionLevel::BestCompression))); + assert_eq!( + codec, + Codec::Deflate(DeflateSettings::new(CompressionLevel::BestCompression)) + ); } #[test] diff --git a/crates/iceberg/src/spec/manifest/mod.rs b/crates/iceberg/src/spec/manifest/mod.rs index fca753ecb2..6c9ab1e1af 100644 --- a/crates/iceberg/src/spec/manifest/mod.rs +++ b/crates/iceberg/src/spec/manifest/mod.rs @@ -167,7 +167,7 @@ mod tests { use super::*; use crate::io::FileIO; - use crate::spec::{Literal, NestedField, PrimitiveType, Struct, Transform, Type}; + use crate::spec::{CompressionSettings, Literal, NestedField, PrimitiveType, Struct, Transform, Type}; #[tokio::test] async fn test_parse_manifest_v2_unpartition() { @@ -273,6 +273,7 @@ mod tests { None, metadata.schema.clone(), metadata.partition_spec.clone(), + CompressionSettings::default(), ) .build_v2_data(); for entry in &entries { @@ -575,6 +576,7 @@ mod tests { None, metadata.schema.clone(), metadata.partition_spec.clone(), + CompressionSettings::default(), ) .build_v2_data(); for entry in &entries { @@ -672,6 +674,7 @@ mod tests { None, metadata.schema.clone(), metadata.partition_spec.clone(), + CompressionSettings::default(), ) .build_v1(); for entry in &entries { @@ -781,6 +784,7 @@ mod tests { None, metadata.schema.clone(), metadata.partition_spec.clone(), + CompressionSettings::default(), ) .build_v1(); for entry in &entries { @@ -889,6 +893,7 @@ mod tests { None, metadata.schema.clone(), metadata.partition_spec.clone(), + CompressionSettings::default(), ) .build_v2_data(); for entry in &entries { @@ -1168,6 +1173,7 @@ mod tests { None, metadata.schema.clone(), metadata.partition_spec.clone(), + CompressionSettings::default(), ) .build_v2_data(); for entry in &entries { diff --git a/crates/iceberg/src/spec/manifest/writer.rs b/crates/iceberg/src/spec/manifest/writer.rs index 33d3c37ddc..a38ef0ba5f 100644 --- a/crates/iceberg/src/spec/manifest/writer.rs +++ b/crates/iceberg/src/spec/manifest/writer.rs @@ -79,28 +79,6 @@ impl ManifestWriterBuilder { } } - /// Create a new builder from an [`EncryptedOutputFile`]. - /// - /// Use this when writing manifests with transparent encryption. - pub fn new_from_encrypted( - encrypted_output: EncryptedOutputFile, - snapshot_id: Option, - key_metadata: Option>, - schema: SchemaRef, - partition_spec: PartitionSpec, - ) -> Self { - let location = encrypted_output.location().to_owned(); - Self { - writer_future: Box::pin(async move { encrypted_output.writer().await }), - location, - snapshot_id, - key_metadata, - schema, - partition_spec, - compression: CompressionSettings::default(), - } - } - /// Build a [`ManifestWriter`] for format version 1. pub fn build_v1(self) -> ManifestWriter { let metadata = ManifestMetadata::builder() @@ -753,6 +731,7 @@ mod tests { None, metadata.schema.clone(), metadata.partition_spec.clone(), + CompressionSettings::default(), ) .build_v2_data(); writer.add_entry(entries[0].clone()).unwrap(); diff --git a/crates/iceberg/src/spec/manifest_list/mod.rs b/crates/iceberg/src/spec/manifest_list/mod.rs index abdbd41fbc..4899d534ba 100644 --- a/crates/iceberg/src/spec/manifest_list/mod.rs +++ b/crates/iceberg/src/spec/manifest_list/mod.rs @@ -100,7 +100,7 @@ mod test { use super::_serde::ManifestFileV2; use super::*; use crate::io::FileIO; - use crate::spec::{Datum, FieldSummary, ManifestContentType, ManifestFile}; + use crate::spec::{CompressionSettings, Datum, FieldSummary, ManifestContentType, ManifestFile}; #[tokio::test] async fn test_parse_manifest_list_v1() { @@ -134,14 +134,10 @@ mod test { let full_path = format!("{}/{}", tmp_dir.path().to_str().unwrap(), file_name); let mut writer = ManifestListWriter::v1( - file_io - .new_output(full_path.clone()) - .unwrap() - .writer() - .await - .unwrap(), + file_io.new_output(full_path.clone()).unwrap(), 1646658105718557341, Some(1646658105718557341), + CompressionSettings::default(), ); writer @@ -211,15 +207,11 @@ mod test { let full_path = format!("{}/{}", tmp_dir.path().to_str().unwrap(), file_name); let mut writer = ManifestListWriter::v2( - file_io - .new_output(full_path.clone()) - .unwrap() - .writer() - .await - .unwrap(), + file_io.new_output(full_path.clone()).unwrap(), 1646658105718557341, Some(1646658105718557341), 1, + CompressionSettings::default(), ); writer @@ -329,16 +321,12 @@ mod test { let full_path = format!("{}/{}", tmp_dir.path().to_str().unwrap(), file_name); let mut writer = ManifestListWriter::v3( - file_io - .new_output(full_path.clone()) - .unwrap() - .writer() - .await - .unwrap(), + file_io.new_output(full_path.clone()).unwrap(), 377075049360453639, Some(377075049360453639), 1, Some(10), + CompressionSettings::default(), ); writer diff --git a/crates/iceberg/src/spec/manifest_list/writer.rs b/crates/iceberg/src/spec/manifest_list/writer.rs index e1c6efa3f6..947ac32340 100644 --- a/crates/iceberg/src/spec/manifest_list/writer.rs +++ b/crates/iceberg/src/spec/manifest_list/writer.rs @@ -26,25 +26,25 @@ use super::_const_schema::{ use super::_serde::{ManifestFileV1, ManifestFileV2, ManifestFileV3}; use super::{FormatVersion, ManifestContentType, ManifestFile, UNASSIGNED_SEQUENCE_NUMBER}; use crate::error::Result; -use crate::io::FileWrite; +use crate::io::OutputFile; use crate::spec::avro_util::CompressionSettings; use crate::{Error, ErrorKind}; /// A manifest list writer. pub struct ManifestListWriter { format_version: FormatVersion, - writer: Box, + output_file: OutputFile, avro_writer: Writer<'static, Vec>, sequence_number: i64, snapshot_id: i64, next_row_id: Option, - compression: CompressionSettings, } impl std::fmt::Debug for ManifestListWriter { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("ManifestListWriter") .field("format_version", &self.format_version) + .field("output_file", &self.output_file) .field("avro_writer", &self.avro_writer.schema()) .finish_non_exhaustive() } @@ -56,24 +56,12 @@ impl ManifestListWriter { self.next_row_id } - /// Set compression settings for the manifest list file. - pub fn with_compression(mut self, compression: CompressionSettings) -> Self { - let avro_schema = match self.format_version { - FormatVersion::V1 => &MANIFEST_LIST_AVRO_SCHEMA_V1, - FormatVersion::V2 => &MANIFEST_LIST_AVRO_SCHEMA_V2, - FormatVersion::V3 => &MANIFEST_LIST_AVRO_SCHEMA_V3, - }; - let codec = compression.to_codec(); - self.avro_writer = Writer::with_codec(avro_schema, Vec::new(), codec); - self.compression = compression; - self - } - - /// Construct a v1 [`ManifestListWriter`] that writes to a provided [`FileWrite`]. + /// Construct a v1 [`ManifestListWriter`] that writes to a provided [`OutputFile`]. pub fn v1( - writer: Box, + output_file: OutputFile, snapshot_id: i64, parent_snapshot_id: Option, + compression_settings: CompressionSettings, ) -> Self { let mut metadata = HashMap::from_iter([ ("snapshot-id".to_string(), snapshot_id.to_string()), @@ -87,21 +75,22 @@ impl ManifestListWriter { } Self::new( FormatVersion::V1, - writer, + output_file, metadata, 0, snapshot_id, None, - CompressionSettings::default(), + compression_settings, ) } - /// Construct a v2 [`ManifestListWriter`] that writes to a provided [`FileWrite`]. + /// Construct a v2 [`ManifestListWriter`] that writes to a provided [`OutputFile`]. pub fn v2( - writer: Box, + output_file: OutputFile, snapshot_id: i64, parent_snapshot_id: Option, sequence_number: i64, + compression_settings: CompressionSettings, ) -> Self { let mut metadata = HashMap::from_iter([ ("snapshot-id".to_string(), snapshot_id.to_string()), @@ -116,22 +105,23 @@ impl ManifestListWriter { ); Self::new( FormatVersion::V2, - writer, + output_file, metadata, sequence_number, snapshot_id, None, - CompressionSettings::default(), + compression_settings, ) } - /// Construct a v3 [`ManifestListWriter`] that writes to a provided [`FileWrite`]. + /// Construct a v3 [`ManifestListWriter`] that writes to a provided [`OutputFile`]. pub fn v3( - writer: Box, + output_file: OutputFile, snapshot_id: i64, parent_snapshot_id: Option, sequence_number: i64, first_row_id: Option, // Always None for delete manifests + compression_settings: CompressionSettings, ) -> Self { let mut metadata = HashMap::from_iter([ ("snapshot-id".to_string(), snapshot_id.to_string()), @@ -152,18 +142,18 @@ impl ManifestListWriter { ); Self::new( FormatVersion::V3, - writer, + output_file, metadata, sequence_number, snapshot_id, first_row_id, - CompressionSettings::default(), + compression_settings, ) } fn new( format_version: FormatVersion, - writer: Box, + output_file: OutputFile, metadata: HashMap, sequence_number: i64, snapshot_id: i64, @@ -184,12 +174,11 @@ impl ManifestListWriter { } Self { format_version, - writer, + output_file, avro_writer, sequence_number, snapshot_id, next_row_id: first_row_id, - compression, } } @@ -227,8 +216,9 @@ impl ManifestListWriter { /// Write the manifest list to the output file. pub async fn close(mut self) -> Result<()> { let data = self.avro_writer.into_inner()?; - self.writer.write(Bytes::from(data)).await?; - self.writer.close().await?; + let mut writer = self.output_file.writer().await?; + writer.write(Bytes::from(data)).await?; + writer.close().await?; Ok(()) } @@ -347,9 +337,9 @@ mod test { use tempfile::TempDir; use super::ManifestListWriter; - use crate::io::{FileIO, FileWrite}; + use crate::io::FileIO; use crate::spec::{ - Datum, FieldSummary, ManifestContentType, ManifestFile, ManifestList, + CompressionSettings, Datum, FieldSummary, ManifestContentType, ManifestFile, ManifestList, UNASSIGNED_SEQUENCE_NUMBER, }; @@ -381,9 +371,9 @@ mod test { let temp_dir = TempDir::new().unwrap(); let path = temp_dir.path().join("manifest_list_v1.avro"); let io = FileIO::new_with_fs(); - let file_writer = file_writer(&path, io).await; + let output_file = output_file(&path, &io); - let mut writer = ManifestListWriter::v1(file_writer, 1646658105718557341, Some(0)); + let mut writer = ManifestListWriter::v1(output_file, 1646658105718557341, Some(0), CompressionSettings::default()); writer .add_manifests(expected_manifest_list.entries.clone().into_iter()) .unwrap(); @@ -428,9 +418,9 @@ mod test { let temp_dir = TempDir::new().unwrap(); let path = temp_dir.path().join("manifest_list_v2.avro"); let io = FileIO::new_with_fs(); - let file_writer = file_writer(&path, io).await; + let output_file = output_file(&path, &io); - let mut writer = ManifestListWriter::v2(file_writer, snapshot_id, Some(0), seq_num); + let mut writer = ManifestListWriter::v2(output_file, snapshot_id, Some(0), seq_num, CompressionSettings::default()); writer .add_manifests(expected_manifest_list.entries.clone().into_iter()) .unwrap(); @@ -476,10 +466,10 @@ mod test { let temp_dir = TempDir::new().unwrap(); let path = temp_dir.path().join("manifest_list_v2.avro"); let io = FileIO::new_with_fs(); - let file_writer = file_writer(&path, io).await; + let output_file = output_file(&path, &io); let mut writer = - ManifestListWriter::v3(file_writer, snapshot_id, Some(0), seq_num, Some(10)); + ManifestListWriter::v3(output_file, snapshot_id, Some(0), seq_num, Some(10), CompressionSettings::default()); writer .add_manifests(expected_manifest_list.entries.clone().into_iter()) .unwrap(); @@ -524,9 +514,9 @@ mod test { let temp_dir = TempDir::new().unwrap(); let path = temp_dir.path().join("manifest_list_v1.avro"); let io = FileIO::new_with_fs(); - let file_writer = file_writer(&path, io).await; + let output_file = output_file(&path, &io); - let mut writer = ManifestListWriter::v1(file_writer, 1646658105718557341, Some(0)); + let mut writer = ManifestListWriter::v1(output_file, 1646658105718557341, Some(0), CompressionSettings::default()); writer .add_manifests(expected_manifest_list.entries.clone().into_iter()) .unwrap(); @@ -569,9 +559,9 @@ mod test { let temp_dir = TempDir::new().unwrap(); let path = temp_dir.path().join("manifest_list_v1.avro"); let io = FileIO::new_with_fs(); - let file_writer = file_writer(&path, io).await; + let output_file = output_file(&path, &io); - let mut writer = ManifestListWriter::v1(file_writer, 1646658105718557341, Some(0)); + let mut writer = ManifestListWriter::v1(output_file, 1646658105718557341, Some(0), CompressionSettings::default()); writer .add_manifests(expected_manifest_list.entries.clone().into_iter()) .unwrap(); @@ -616,9 +606,9 @@ mod test { let temp_dir = TempDir::new().unwrap(); let path = temp_dir.path().join("manifest_list_v2.avro"); let io = FileIO::new_with_fs(); - let file_writer = file_writer(&path, io).await; + let output_file = output_file(&path, &io); - let mut writer = ManifestListWriter::v2(file_writer, snapshot_id, Some(0), seq_num); + let mut writer = ManifestListWriter::v2(output_file, snapshot_id, Some(0), seq_num, CompressionSettings::default()); writer .add_manifests(expected_manifest_list.entries.clone().into_iter()) .unwrap(); @@ -635,11 +625,7 @@ mod test { temp_dir.close().unwrap(); } - async fn file_writer(path: &Path, io: FileIO) -> Box { - io.new_output(path.to_str().unwrap()) - .unwrap() - .writer() - .await - .unwrap() + fn output_file(path: &Path, io: &FileIO) -> crate::io::OutputFile { + io.new_output(path.to_str().unwrap()).unwrap() } } diff --git a/crates/iceberg/src/spec/table_properties.rs b/crates/iceberg/src/spec/table_properties.rs index 57c32438e6..83404402e0 100644 --- a/crates/iceberg/src/spec/table_properties.rs +++ b/crates/iceberg/src/spec/table_properties.rs @@ -935,9 +935,11 @@ mod tests { )]); let result = TableProperties::try_from(&props); assert!(result.is_err()); - assert!(result - .unwrap_err() - .to_string() - .contains("Invalid value for write.avro.compression-level")); + assert!( + result + .unwrap_err() + .to_string() + .contains("Invalid value for write.avro.compression-level") + ); } } diff --git a/crates/iceberg/src/transaction/snapshot.rs b/crates/iceberg/src/transaction/snapshot.rs index ebd51848ea..431a0ee84f 100644 --- a/crates/iceberg/src/transaction/snapshot.rs +++ b/crates/iceberg/src/transaction/snapshot.rs @@ -457,12 +457,10 @@ impl<'a> SnapshotProducer<'a> { let manifest_list_path = self.generate_manifest_list_file_path(0); let next_seq_num = self.table.metadata().next_sequence_number(); let first_row_id = self.table.metadata().next_row_id(); - let writer = self + let output_file = self .table .file_io() - .new_output(manifest_list_path.clone())? - .writer() - .await?; + .new_output(manifest_list_path.clone())?; let table_props = TableProperties::try_from(self.table.metadata().properties()).map_err(|e| { Error::new( @@ -478,25 +476,27 @@ impl<'a> SnapshotProducer<'a> { let mut manifest_list_writer = match self.table.metadata().format_version() { FormatVersion::V1 => ManifestListWriter::v1( - writer, + output_file, self.snapshot_id, self.table.metadata().current_snapshot_id(), + compression.clone(), ), FormatVersion::V2 => ManifestListWriter::v2( - writer, + output_file, self.snapshot_id, self.table.metadata().current_snapshot_id(), next_seq_num, + compression.clone(), ), FormatVersion::V3 => ManifestListWriter::v3( - writer, + output_file, self.snapshot_id, self.table.metadata().current_snapshot_id(), next_seq_num, Some(first_row_id), + compression, ), - } - .with_compression(compression); + }; // Calling self.summary() before self.manifest_file() is important because self.added_data_files // will be set to an empty vec after self.manifest_file() returns, resulting in an empty summary From 52668a8b5d04fb6e7ed47d5292cf68699df7f0dd Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Sun, 16 Nov 2025 00:52:36 +0000 Subject: [PATCH 06/33] no clone needed --- crates/iceberg/src/transaction/snapshot.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/iceberg/src/transaction/snapshot.rs b/crates/iceberg/src/transaction/snapshot.rs index 431a0ee84f..00a6214ba5 100644 --- a/crates/iceberg/src/transaction/snapshot.rs +++ b/crates/iceberg/src/transaction/snapshot.rs @@ -479,14 +479,14 @@ impl<'a> SnapshotProducer<'a> { output_file, self.snapshot_id, self.table.metadata().current_snapshot_id(), - compression.clone(), + compression, ), FormatVersion::V2 => ManifestListWriter::v2( output_file, self.snapshot_id, self.table.metadata().current_snapshot_id(), next_seq_num, - compression.clone(), + compression, ), FormatVersion::V3 => ManifestListWriter::v3( output_file, From 05a32e5003565e43915a9fa8c4eac17aee149f9b Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Sun, 16 Nov 2025 01:22:27 +0000 Subject: [PATCH 07/33] test compression works --- crates/iceberg/src/spec/manifest/writer.rs | 127 ++++++++++++++++++++- 1 file changed, 125 insertions(+), 2 deletions(-) diff --git a/crates/iceberg/src/spec/manifest/writer.rs b/crates/iceberg/src/spec/manifest/writer.rs index a38ef0ba5f..8a07fdfade 100644 --- a/crates/iceberg/src/spec/manifest/writer.rs +++ b/crates/iceberg/src/spec/manifest/writer.rs @@ -598,8 +598,12 @@ mod tests { use tempfile::TempDir; use super::*; - use crate::io::FileIO; - use crate::spec::{DataFileFormat, Manifest, NestedField, PrimitiveType, Schema, Struct, Type}; + use crate::io::{FileIO, FileIOBuilder}; + use crate::spec::{ + DataContentType, DataFileBuilder, DataFileFormat, Manifest, ManifestContentType, + ManifestEntry, ManifestMetadata, ManifestStatus, NestedField, PartitionSpec, PrimitiveType, + Schema, Struct, Type, + }; #[tokio::test] async fn test_add_delete_existing() { @@ -820,6 +824,7 @@ mod tests { None, schema.clone(), partition_spec.clone(), + CompressionSettings::default(), ) .build_v3_deletes(); @@ -840,4 +845,122 @@ mod tests { ManifestContentType::Deletes, ); } + + #[tokio::test] + async fn test_manifest_writer_with_compression() { + let metadata = { + let schema = Schema::builder() + .with_fields(vec![Arc::new(NestedField::required( + 1, + "id", + Type::Primitive(PrimitiveType::Int), + ))]) + .build() + .unwrap(); + + ManifestMetadata { + schema_id: 0, + schema: Arc::new(schema), + partition_spec: PartitionSpec::unpartition_spec(), + format_version: FormatVersion::V2, + content: ManifestContentType::Data, + } + }; + + // Write uncompressed manifest with multiple entries to make compression effective + let tmp_dir = TempDir::new().unwrap(); + let uncompressed_path = tmp_dir.path().join("uncompressed_manifest.avro"); + let io = FileIOBuilder::new_fs_io().build().unwrap(); + let output_file = io.new_output(uncompressed_path.to_str().unwrap()).unwrap(); + let uncompressed_settings = CompressionSettings::new("uncompressed".to_string(), None); + let mut writer = ManifestWriterBuilder::new( + output_file, + Some(1), + None, + metadata.schema.clone(), + metadata.partition_spec.clone(), + uncompressed_settings, + ) + .build_v2_data(); + // Add multiple entries with long paths to create compressible data + for i in 0..1000 { + let data_file = DataFileBuilder::default() + .content(DataContentType::Data) + .file_path(format!( + "/very/long/path/to/data/directory/with/many/subdirectories/file_{}.parquet", + i + )) + .file_format(DataFileFormat::Parquet) + .partition(Struct::empty()) + .file_size_in_bytes(100000 + i) + .record_count(1000 + i) + .build() + .unwrap(); + + let entry = ManifestEntry::builder() + .status(ManifestStatus::Added) + .snapshot_id(1) + .sequence_number(1) + .file_sequence_number(1) + .data_file(data_file) + .build(); + writer.add_entry(entry).unwrap(); + } + writer.write_manifest_file().await.unwrap(); + let uncompressed_size = fs::metadata(&uncompressed_path).unwrap().len(); + + // Write compressed manifest with gzip + let compressed_path = tmp_dir.path().join("compressed_manifest.avro"); + let output_file = io.new_output(compressed_path.to_str().unwrap()).unwrap(); + let compression = CompressionSettings::new("gzip".to_string(), Some(9)); + let mut writer = ManifestWriterBuilder::new( + output_file, + Some(1), + None, + metadata.schema.clone(), + metadata.partition_spec.clone(), + compression, + ) + .build_v2_data(); + // Add the same entries with long paths as the uncompressed version + for i in 0..1000 { + let data_file = DataFileBuilder::default() + .content(DataContentType::Data) + .file_path(format!( + "/very/long/path/to/data/directory/with/many/subdirectories/file_{}.parquet", + i + )) + .file_format(DataFileFormat::Parquet) + .partition(Struct::empty()) + .file_size_in_bytes(100000 + i) + .record_count(1000 + i) + .build() + .unwrap(); + + let entry = ManifestEntry::builder() + .status(ManifestStatus::Added) + .snapshot_id(1) + .sequence_number(1) + .file_sequence_number(1) + .data_file(data_file) + .build(); + writer.add_entry(entry).unwrap(); + } + writer.write_manifest_file().await.unwrap(); + let compressed_size = fs::metadata(&compressed_path).unwrap().len(); + + // Verify compression is actually working + assert!( + compressed_size < uncompressed_size, + "Compressed size ({}) should be less than uncompressed size ({})", + compressed_size, + uncompressed_size + ); + + // Verify the compressed file can be read back correctly + let compressed_bytes = fs::read(&compressed_path).unwrap(); + let manifest = Manifest::parse_avro(&compressed_bytes).unwrap(); + assert_eq!(manifest.metadata.format_version, FormatVersion::V2); + assert_eq!(manifest.entries.len(), 1000); + } } From 129735fd778d1dec4e4140415568b78af7b86c56 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Mon, 17 Nov 2025 21:51:19 +0000 Subject: [PATCH 08/33] comments --- crates/iceberg/src/spec/avro_util.rs | 24 +++++++++++----------- crates/iceberg/src/spec/manifest/writer.rs | 1 - crates/iceberg/src/transaction/snapshot.rs | 4 ++-- 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/crates/iceberg/src/spec/avro_util.rs b/crates/iceberg/src/spec/avro_util.rs index 77baf32b7f..846bf80aef 100644 --- a/crates/iceberg/src/spec/avro_util.rs +++ b/crates/iceberg/src/spec/avro_util.rs @@ -25,7 +25,7 @@ use crate::spec::TableProperties; /// Settings for compression codec and level. #[derive(Debug, Clone, PartialEq, Eq)] pub struct CompressionSettings { - /// The compression codec name (e.g., "gzip", "zstd", "deflate", "uncompressed") + /// The compression codec name (e.g., "gzip", "zstd", "snappy", "uncompressed") pub codec: String, /// The compression level (None uses codec-specific defaults: gzip=9, zstd=1) pub level: Option, @@ -57,8 +57,8 @@ impl Default for CompressionSettings { /// /// # Arguments /// -/// * `codec` - The name of the compression codec (e.g., "gzip", "zstd", "deflate", "uncompressed") -/// * `level` - The compression level (None uses codec defaults: gzip=9, zstd=1). For deflate/gzip: +/// * `codec` - The name of the compression codec (e.g., "gzip", "zstd", "snappy", "uncompressed") +/// * `level` - The compression level (None uses codec defaults: gzip=9, zstd=1). For gzip: /// - 0: NoCompression /// - 1: BestSpeed /// - 9: BestCompression @@ -67,8 +67,9 @@ impl Default for CompressionSettings { /// /// # Supported Codecs /// -/// - `gzip` or `deflate`: Uses Deflate compression with specified level (default: 9) +/// - `gzip`: Uses Deflate compression with specified level (default: 9) /// - `zstd`: Uses Zstandard compression (default: 1, level clamped to valid zstd range 0-22) +/// - `snappy`: Uses Snappy compression (level parameter ignored) /// - `uncompressed` or `None`: No compression /// - Any other value: Defaults to no compression (Codec::Null) /// @@ -83,8 +84,9 @@ impl Default for CompressionSettings { pub(crate) fn codec_from_str(codec: Option<&str>, level: Option) -> Codec { use apache_avro::{DeflateSettings, ZstandardSettings}; - match codec { - Some("gzip") | Some("deflate") => { + // Use case-insensitive comparison to match Java implementation + match codec.map(|s| s.to_lowercase()).as_deref() { + Some("gzip") => { // Map compression level to miniz_oxide::deflate::CompressionLevel // Reference: https://docs.rs/miniz_oxide/latest/miniz_oxide/deflate/enum.CompressionLevel.html // Default level for gzip/deflate is 9 (BestCompression) to match Java @@ -106,6 +108,7 @@ pub(crate) fn codec_from_str(codec: Option<&str>, level: Option) -> Codec { let zstd_level = level.unwrap_or(1).min(22); Codec::Zstandard(ZstandardSettings::new(zstd_level)) } + Some("snappy") => Codec::Snappy, Some("uncompressed") | None => Codec::Null, Some(unknown) => { warn!( @@ -137,12 +140,9 @@ mod tests { } #[test] - fn test_codec_from_str_deflate() { - let codec = codec_from_str(Some("deflate"), Some(9)); - assert_eq!( - codec, - Codec::Deflate(DeflateSettings::new(CompressionLevel::BestCompression)) - ); + fn test_codec_from_str_snappy() { + let codec = codec_from_str(Some("snappy"), None); + assert_eq!(codec, Codec::Snappy); } #[test] diff --git a/crates/iceberg/src/spec/manifest/writer.rs b/crates/iceberg/src/spec/manifest/writer.rs index 8a07fdfade..2eb61f8013 100644 --- a/crates/iceberg/src/spec/manifest/writer.rs +++ b/crates/iceberg/src/spec/manifest/writer.rs @@ -444,7 +444,6 @@ impl ManifestWriter { FormatVersion::V2 | FormatVersion::V3 => manifest_schema_v2(&partition_type)?, }; - // Determine compression codec using CompressionSettings let codec = self.compression.to_codec(); let mut avro_writer = AvroWriter::with_codec(&avro_schema, Vec::new(), codec); diff --git a/crates/iceberg/src/transaction/snapshot.rs b/crates/iceberg/src/transaction/snapshot.rs index 00a6214ba5..34b9df4774 100644 --- a/crates/iceberg/src/transaction/snapshot.rs +++ b/crates/iceberg/src/transaction/snapshot.rs @@ -263,7 +263,7 @@ impl<'a> SnapshotProducer<'a> { .with_source(e) })?; let compression = CompressionSettings::new( - table_props.avro_compression_codec.clone(), + table_props.avro_compression_codec, table_props.avro_compression_level, ); @@ -470,7 +470,7 @@ impl<'a> SnapshotProducer<'a> { .with_source(e) })?; let compression = CompressionSettings::new( - table_props.avro_compression_codec.clone(), + table_props.avro_compression_codec, table_props.avro_compression_level, ); From 2e3dfafa776e2e4894e6d882f355bbb72b504b2b Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Mon, 17 Nov 2025 21:57:27 +0000 Subject: [PATCH 09/33] update tests --- crates/iceberg/src/spec/avro_util.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crates/iceberg/src/spec/avro_util.rs b/crates/iceberg/src/spec/avro_util.rs index 846bf80aef..e78f019cfa 100644 --- a/crates/iceberg/src/spec/avro_util.rs +++ b/crates/iceberg/src/spec/avro_util.rs @@ -132,7 +132,8 @@ mod tests { #[test] fn test_codec_from_str_gzip() { - let codec = codec_from_str(Some("gzip"), Some(5)); + // Test with mixed case to verify case-insensitive matching + let codec = codec_from_str(Some("GZip"), Some(5)); assert_eq!( codec, Codec::Deflate(DeflateSettings::new(CompressionLevel::DefaultLevel)) From ba8194748fbc23199eaebaa41a4c6290da625511 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 19 Nov 2025 19:06:53 +0000 Subject: [PATCH 10/33] address comments --- crates/iceberg/src/spec/table_metadata.rs | 3 ++- crates/iceberg/src/spec/table_properties.rs | 8 ++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/crates/iceberg/src/spec/table_metadata.rs b/crates/iceberg/src/spec/table_metadata.rs index 607fd98350..2a51ef7f0a 100644 --- a/crates/iceberg/src/spec/table_metadata.rs +++ b/crates/iceberg/src/spec/table_metadata.rs @@ -1619,7 +1619,7 @@ mod tests { BlobMetadata, EncryptedKey, INITIAL_ROW_ID, Literal, NestedField, NullOrder, Operation, PartitionSpec, PartitionStatisticsFile, PrimitiveLiteral, PrimitiveType, Schema, Snapshot, SnapshotReference, SnapshotRetention, SortDirection, SortField, SortOrder, StatisticsFile, - Summary, TableProperties, Transform, Type, UnboundPartitionField, + Summary, Transform, Type, UnboundPartitionField, }; use crate::{ErrorKind, TableCreation}; @@ -3718,6 +3718,7 @@ mod tests { assert_eq!(read_metadata, compressed_metadata); } + #[test] fn test_partition_name_exists() { let schema = Schema::builder() diff --git a/crates/iceberg/src/spec/table_properties.rs b/crates/iceberg/src/spec/table_properties.rs index 83404402e0..2b9b6a0c4d 100644 --- a/crates/iceberg/src/spec/table_properties.rs +++ b/crates/iceberg/src/spec/table_properties.rs @@ -232,9 +232,9 @@ impl TableProperties { /// Default target file size pub const PROPERTY_WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT: usize = 512 * 1024 * 1024; // 512 MB - /// Compression codec for metadata files (JSON) + /// Compression codec for metadata files (JSON). pub const PROPERTY_METADATA_COMPRESSION_CODEC: &str = "write.metadata.compression-codec"; - /// Default metadata compression codec - uncompressed + /// Default metadata compression codec pub const PROPERTY_METADATA_COMPRESSION_CODEC_DEFAULT: &str = "none"; /// Compression codec for Avro files (manifests, manifest lists) @@ -501,10 +501,6 @@ mod tests { #[test] fn test_table_properties_compression() { let props = HashMap::from([ - ( - TableProperties::PROPERTY_METADATA_COMPRESSION_CODEC.to_string(), - "gzip".to_string(), - ), ( TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC.to_string(), "zstd".to_string(), From 870841b3aa4ddd146d0db9e80b6704693e13265e Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 19 Nov 2025 21:53:17 +0000 Subject: [PATCH 11/33] remove parse optional property --- crates/iceberg/src/spec/table_properties.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/crates/iceberg/src/spec/table_properties.rs b/crates/iceberg/src/spec/table_properties.rs index 2b9b6a0c4d..fbfdd24a53 100644 --- a/crates/iceberg/src/spec/table_properties.rs +++ b/crates/iceberg/src/spec/table_properties.rs @@ -97,6 +97,7 @@ pub(crate) fn parse_metadata_file_compression( } } + /// TableProperties that contains the properties of a table. #[derive(Debug)] pub struct TableProperties { @@ -345,6 +346,7 @@ impl TryFrom<&HashMap> for TableProperties { TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC, TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC_DEFAULT.to_string(), )?, +<<<<<<< HEAD avro_compression_level: props .get(TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL) .map(|v| { @@ -412,6 +414,20 @@ impl TryFrom<&HashMap> for TableProperties { TableProperties::PROPERTY_ENCRYPTION_DATA_KEY_LENGTH, TableProperties::PROPERTY_ENCRYPTION_DATA_KEY_LENGTH_DEFAULT, )?, +======= + avro_compression_level: { + let level = parse_property( + props, + TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL, + 255u8, + )?; + if level == 255 { + None + } else { + Some(level) + } + }, +>>>>>>> 5370f775 (remove parse optional property) }) } } @@ -658,6 +674,7 @@ mod tests { } #[test] +<<<<<<< HEAD fn test_table_properties_compression_invalid_rejected() { let invalid_codecs = ["lz4", "zstd", "snappy"]; @@ -910,6 +927,8 @@ mod tests { } #[test] +======= +>>>>>>> 5370f775 (remove parse optional property) fn test_table_properties_optional_compression_level() { // Test that compression level is None when not specified let props = HashMap::new(); From ea289f6440f09ea9878310144091b486de201989 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 19 Nov 2025 22:01:16 +0000 Subject: [PATCH 12/33] fmt --- crates/iceberg/src/spec/table_properties.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/crates/iceberg/src/spec/table_properties.rs b/crates/iceberg/src/spec/table_properties.rs index fbfdd24a53..b55548d9bf 100644 --- a/crates/iceberg/src/spec/table_properties.rs +++ b/crates/iceberg/src/spec/table_properties.rs @@ -421,11 +421,7 @@ impl TryFrom<&HashMap> for TableProperties { TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL, 255u8, )?; - if level == 255 { - None - } else { - Some(level) - } + if level == 255 { None } else { Some(level) } }, >>>>>>> 5370f775 (remove parse optional property) }) From 08fec14ccd572cbed8c1260c2675909d650cb289 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Fri, 21 Nov 2025 18:56:17 +0000 Subject: [PATCH 13/33] wip --- crates/iceberg/src/spec/table_properties.rs | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/crates/iceberg/src/spec/table_properties.rs b/crates/iceberg/src/spec/table_properties.rs index b55548d9bf..d039272c47 100644 --- a/crates/iceberg/src/spec/table_properties.rs +++ b/crates/iceberg/src/spec/table_properties.rs @@ -346,7 +346,6 @@ impl TryFrom<&HashMap> for TableProperties { TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC, TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC_DEFAULT.to_string(), )?, -<<<<<<< HEAD avro_compression_level: props .get(TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL) .map(|v| { @@ -414,16 +413,6 @@ impl TryFrom<&HashMap> for TableProperties { TableProperties::PROPERTY_ENCRYPTION_DATA_KEY_LENGTH, TableProperties::PROPERTY_ENCRYPTION_DATA_KEY_LENGTH_DEFAULT, )?, -======= - avro_compression_level: { - let level = parse_property( - props, - TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL, - 255u8, - )?; - if level == 255 { None } else { Some(level) } - }, ->>>>>>> 5370f775 (remove parse optional property) }) } } @@ -670,7 +659,6 @@ mod tests { } #[test] -<<<<<<< HEAD fn test_table_properties_compression_invalid_rejected() { let invalid_codecs = ["lz4", "zstd", "snappy"]; @@ -923,8 +911,6 @@ mod tests { } #[test] -======= ->>>>>>> 5370f775 (remove parse optional property) fn test_table_properties_optional_compression_level() { // Test that compression level is None when not specified let props = HashMap::new(); From 9f22ca840b7fdb0af2f1976419e422978ed1ee9f Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Fri, 21 Nov 2025 19:06:07 +0000 Subject: [PATCH 14/33] address comments --- crates/iceberg/src/spec/avro_util.rs | 22 ++++++++------------- crates/iceberg/src/spec/table_properties.rs | 2 +- 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/crates/iceberg/src/spec/avro_util.rs b/crates/iceberg/src/spec/avro_util.rs index e78f019cfa..e86a5ecd59 100644 --- a/crates/iceberg/src/spec/avro_util.rs +++ b/crates/iceberg/src/spec/avro_util.rs @@ -27,7 +27,7 @@ use crate::spec::TableProperties; pub struct CompressionSettings { /// The compression codec name (e.g., "gzip", "zstd", "snappy", "uncompressed") pub codec: String, - /// The compression level (None uses codec-specific defaults: gzip=9, zstd=1) + /// The compression level (None uses codec-specific defaults) pub level: Option, } @@ -58,29 +58,23 @@ impl Default for CompressionSettings { /// # Arguments /// /// * `codec` - The name of the compression codec (e.g., "gzip", "zstd", "snappy", "uncompressed") -/// * `level` - The compression level (None uses codec defaults: gzip=9, zstd=1). For gzip: +/// * `level` - The compression level. For gzip/deflate: /// - 0: NoCompression /// - 1: BestSpeed /// - 9: BestCompression /// - 10: UberCompression -/// - Other values: DefaultLevel (6) +/// - 6: DefaultLevel (balanced speed/compression) +/// - Other values: DefaultLevel +/// For zstd, level is clamped to valid range (0-22). +/// When `None`, uses codec-specific defaults. /// /// # Supported Codecs /// -/// - `gzip`: Uses Deflate compression with specified level (default: 9) -/// - `zstd`: Uses Zstandard compression (default: 1, level clamped to valid zstd range 0-22) +/// - `gzip`: Uses Deflate compression with specified level +/// - `zstd`: Uses Zstandard compression (level clamped to valid zstd range 0-22) /// - `snappy`: Uses Snappy compression (level parameter ignored) /// - `uncompressed` or `None`: No compression /// - Any other value: Defaults to no compression (Codec::Null) -/// -/// # Compression Levels -/// -/// The compression level mapping is based on miniz_oxide's CompressionLevel enum: -/// - Level 0: No compression -/// - Level 1: Best speed (fastest) -/// - Level 9: Best compression (slower, better compression) -/// - Level 10: Uber compression (slowest, best compression) -/// - Other: Default level (balanced speed/compression) pub(crate) fn codec_from_str(codec: Option<&str>, level: Option) -> Codec { use apache_avro::{DeflateSettings, ZstandardSettings}; diff --git a/crates/iceberg/src/spec/table_properties.rs b/crates/iceberg/src/spec/table_properties.rs index d039272c47..dd3484df8a 100644 --- a/crates/iceberg/src/spec/table_properties.rs +++ b/crates/iceberg/src/spec/table_properties.rs @@ -117,7 +117,7 @@ pub struct TableProperties { pub metadata_compression_codec: CompressionCodec, /// Compression codec for Avro files (manifests, manifest lists) pub avro_compression_codec: String, - /// Compression level for Avro files (None uses codec-specific defaults: gzip=9, zstd=1) + /// Compression level for Avro files (None uses codec-specific defaults) pub avro_compression_level: Option, /// Whether to use `FanoutWriter` for partitioned tables. pub write_datafusion_fanout_enabled: bool, From ffcdf764c0f13665ec835a0835876b76337670af Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Thu, 11 Dec 2025 01:37:19 +0000 Subject: [PATCH 15/33] put parsing in table properties --- crates/iceberg/src/io/object_cache.rs | 11 +-- crates/iceberg/src/scan/mod.rs | 11 +-- crates/iceberg/src/spec/avro_util.rs | 72 +++++++------------ crates/iceberg/src/spec/manifest/mod.rs | 16 +++-- crates/iceberg/src/spec/manifest/writer.rs | 26 ++++--- crates/iceberg/src/spec/manifest_list/mod.rs | 10 +-- .../iceberg/src/spec/manifest_list/writer.rs | 39 +++++----- crates/iceberg/src/spec/mod.rs | 3 +- crates/iceberg/src/spec/table_properties.rs | 43 +++++------ crates/iceberg/src/transaction/snapshot.rs | 27 +++---- 10 files changed, 112 insertions(+), 146 deletions(-) diff --git a/crates/iceberg/src/io/object_cache.rs b/crates/iceberg/src/io/object_cache.rs index 6f129d23b1..f7b87cd317 100644 --- a/crates/iceberg/src/io/object_cache.rs +++ b/crates/iceberg/src/io/object_cache.rs @@ -212,13 +212,14 @@ mod tests { use tempfile::TempDir; use uuid::Uuid; + use apache_avro::Codec; + use super::*; use crate::TableIdent; use crate::io::{FileIO, OutputFile}; use crate::spec::{ - CompressionSettings, DataContentType, DataFileBuilder, DataFileFormat, Literal, - ManifestEntry, ManifestListWriter, ManifestStatus, ManifestWriterBuilder, Struct, - TableMetadata, + DataContentType, DataFileBuilder, DataFileFormat, Literal, ManifestEntry, + ManifestListWriter, ManifestStatus, ManifestWriterBuilder, Struct, TableMetadata, }; use crate::table::Table; use crate::test_utils::test_runtime; @@ -297,7 +298,7 @@ mod tests { None, current_schema.clone(), current_partition_spec.as_ref().clone(), - CompressionSettings::default(), + Codec::Null, ) .build_v2_data(); writer @@ -335,7 +336,7 @@ mod tests { current_snapshot.snapshot_id(), current_snapshot.parent_snapshot_id(), current_snapshot.sequence_number(), - CompressionSettings::default(), + Codec::Null, ); manifest_list_write .add_manifests(vec![data_file_manifest].into_iter()) diff --git a/crates/iceberg/src/scan/mod.rs b/crates/iceberg/src/scan/mod.rs index 2a373b38e5..9db2341ca8 100644 --- a/crates/iceberg/src/scan/mod.rs +++ b/crates/iceberg/src/scan/mod.rs @@ -634,6 +634,7 @@ pub mod tests { use minijinja::{AutoEscape, Environment, context}; use parquet::arrow::{ArrowWriter, PARQUET_FIELD_ID_META_KEY}; use parquet::basic::Compression; + use apache_avro::Codec; use parquet::file::properties::WriterProperties; use tempfile::TempDir; use uuid::Uuid; @@ -644,7 +645,7 @@ pub mod tests { use crate::metadata_columns::RESERVED_COL_NAME_FILE; use crate::scan::FileScanTask; use crate::spec::{ - CompressionSettings, DEFAULT_SCHEMA_NAME_MAPPING, DataContentType, DataFileBuilder, + DEFAULT_SCHEMA_NAME_MAPPING, DataContentType, DataFileBuilder, DataFileFormat, Datum, Literal, ManifestEntry, ManifestListWriter, ManifestStatus, ManifestWriterBuilder, NestedField, PartitionSpec, PrimitiveType, Schema, Struct, StructType, TableMetadata, Type, @@ -849,7 +850,7 @@ pub mod tests { None, current_schema.clone(), current_partition_spec.as_ref().clone(), - CompressionSettings::default(), + Codec::Null, ) .build_v2_data(); writer @@ -932,7 +933,7 @@ pub mod tests { current_snapshot.snapshot_id(), current_snapshot.parent_snapshot_id(), current_snapshot.sequence_number(), - CompressionSettings::default(), + Codec::Null, ); manifest_list_write .add_manifests(vec![data_file_manifest].into_iter()) @@ -1080,7 +1081,7 @@ pub mod tests { None, current_schema.clone(), current_partition_spec.as_ref().clone(), - CompressionSettings::default(), + Codec::Null, ) .build_v2_data(); @@ -1170,7 +1171,7 @@ pub mod tests { current_snapshot.snapshot_id(), current_snapshot.parent_snapshot_id(), current_snapshot.sequence_number(), - CompressionSettings::default(), + Codec::Null, ); manifest_list_write .add_manifests(vec![data_file_manifest].into_iter()) diff --git a/crates/iceberg/src/spec/avro_util.rs b/crates/iceberg/src/spec/avro_util.rs index e86a5ecd59..dd7d0dce65 100644 --- a/crates/iceberg/src/spec/avro_util.rs +++ b/crates/iceberg/src/spec/avro_util.rs @@ -17,40 +17,23 @@ //! Utilities for working with Apache Avro in Iceberg. -use apache_avro::Codec; +use apache_avro::{Codec, DeflateSettings, ZstandardSettings}; use log::warn; - -use crate::spec::TableProperties; - -/// Settings for compression codec and level. -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct CompressionSettings { - /// The compression codec name (e.g., "gzip", "zstd", "snappy", "uncompressed") - pub codec: String, - /// The compression level (None uses codec-specific defaults) - pub level: Option, -} - -impl CompressionSettings { - /// Create a new CompressionSettings with the specified codec and level. - pub fn new(codec: String, level: Option) -> Self { - Self { codec, level } - } - - /// Convert to apache_avro::Codec using the codec_from_str helper function. - pub(crate) fn to_codec(&self) -> Codec { - codec_from_str(Some(&self.codec), self.level) - } -} - -impl Default for CompressionSettings { - fn default() -> Self { - Self { - codec: TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC_DEFAULT.to_string(), - level: None, - } - } -} +use miniz_oxide::deflate::CompressionLevel; + +/// Codec name for gzip compression +pub const CODEC_GZIP: &str = "gzip"; +/// Codec name for zstd compression +pub const CODEC_ZSTD: &str = "zstd"; +/// Codec name for snappy compression +pub const CODEC_SNAPPY: &str = "snappy"; +/// Codec name for uncompressed +pub const CODEC_UNCOMPRESSED: &str = "uncompressed"; + +/// Default compression level for gzip (matches Java implementation) +pub const DEFAULT_GZIP_LEVEL: u8 = 9; +/// Default compression level for zstd (matches Java implementation) +pub const DEFAULT_ZSTD_LEVEL: u8 = 1; /// Convert codec name and level to apache_avro::Codec. /// Returns Codec::Null for unknown or unsupported codecs. @@ -75,18 +58,15 @@ impl Default for CompressionSettings { /// - `snappy`: Uses Snappy compression (level parameter ignored) /// - `uncompressed` or `None`: No compression /// - Any other value: Defaults to no compression (Codec::Null) -pub(crate) fn codec_from_str(codec: Option<&str>, level: Option) -> Codec { - use apache_avro::{DeflateSettings, ZstandardSettings}; - +/// Convert codec name and level to apache_avro::Codec. +/// This is public so it can be used when parsing table properties. +pub fn codec_from_str(codec: Option<&str>, level: Option) -> Codec { // Use case-insensitive comparison to match Java implementation match codec.map(|s| s.to_lowercase()).as_deref() { - Some("gzip") => { + Some(c) if c == CODEC_GZIP => { // Map compression level to miniz_oxide::deflate::CompressionLevel // Reference: https://docs.rs/miniz_oxide/latest/miniz_oxide/deflate/enum.CompressionLevel.html - // Default level for gzip/deflate is 9 (BestCompression) to match Java - use miniz_oxide::deflate::CompressionLevel; - - let compression_level = match level.unwrap_or(9) { + let compression_level = match level.unwrap_or(DEFAULT_GZIP_LEVEL) { 0 => CompressionLevel::NoCompression, 1 => CompressionLevel::BestSpeed, 9 => CompressionLevel::BestCompression, @@ -96,14 +76,14 @@ pub(crate) fn codec_from_str(codec: Option<&str>, level: Option) -> Codec { Codec::Deflate(DeflateSettings::new(compression_level)) } - Some("zstd") => { + Some(c) if c == CODEC_ZSTD => { // Zstandard supports levels 0-22, clamp to valid range - // Default level for zstd is 1 to match Java - let zstd_level = level.unwrap_or(1).min(22); + let zstd_level = level.unwrap_or(DEFAULT_ZSTD_LEVEL).min(22); Codec::Zstandard(ZstandardSettings::new(zstd_level)) } - Some("snappy") => Codec::Snappy, - Some("uncompressed") | None => Codec::Null, + Some(c) if c == CODEC_SNAPPY => Codec::Snappy, + Some(c) if c == CODEC_UNCOMPRESSED => Codec::Null, + None => Codec::Null, Some(unknown) => { warn!( "Unrecognized compression codec '{}', using no compression (Codec::Null)", diff --git a/crates/iceberg/src/spec/manifest/mod.rs b/crates/iceberg/src/spec/manifest/mod.rs index 6c9ab1e1af..4f00153762 100644 --- a/crates/iceberg/src/spec/manifest/mod.rs +++ b/crates/iceberg/src/spec/manifest/mod.rs @@ -165,9 +165,11 @@ mod tests { use serde_json::{Value, to_vec}; use tempfile::TempDir; + use apache_avro::Codec; + use super::*; use crate::io::FileIO; - use crate::spec::{CompressionSettings, Literal, NestedField, PrimitiveType, Struct, Transform, Type}; + use crate::spec::{Literal, NestedField, PrimitiveType, Struct, Transform, Type}; #[tokio::test] async fn test_parse_manifest_v2_unpartition() { @@ -273,7 +275,7 @@ mod tests { None, metadata.schema.clone(), metadata.partition_spec.clone(), - CompressionSettings::default(), + Codec::Null, ) .build_v2_data(); for entry in &entries { @@ -576,7 +578,7 @@ mod tests { None, metadata.schema.clone(), metadata.partition_spec.clone(), - CompressionSettings::default(), + Codec::Null, ) .build_v2_data(); for entry in &entries { @@ -674,7 +676,7 @@ mod tests { None, metadata.schema.clone(), metadata.partition_spec.clone(), - CompressionSettings::default(), + Codec::Null, ) .build_v1(); for entry in &entries { @@ -784,7 +786,7 @@ mod tests { None, metadata.schema.clone(), metadata.partition_spec.clone(), - CompressionSettings::default(), + Codec::Null, ) .build_v1(); for entry in &entries { @@ -893,7 +895,7 @@ mod tests { None, metadata.schema.clone(), metadata.partition_spec.clone(), - CompressionSettings::default(), + Codec::Null, ) .build_v2_data(); for entry in &entries { @@ -1173,7 +1175,7 @@ mod tests { None, metadata.schema.clone(), metadata.partition_spec.clone(), - CompressionSettings::default(), + Codec::Null, ) .build_v2_data(); for entry in &entries { diff --git a/crates/iceberg/src/spec/manifest/writer.rs b/crates/iceberg/src/spec/manifest/writer.rs index 2eb61f8013..d541d74acf 100644 --- a/crates/iceberg/src/spec/manifest/writer.rs +++ b/crates/iceberg/src/spec/manifest/writer.rs @@ -19,7 +19,7 @@ use std::cmp::min; use std::future::Future; use std::pin::Pin; -use apache_avro::{Writer as AvroWriter, to_value}; +use apache_avro::{Codec, Writer as AvroWriter, to_value}; use bytes::Bytes; use itertools::Itertools; use serde_json::to_vec; @@ -31,7 +31,6 @@ use super::{ use crate::encryption::EncryptedOutputFile; use crate::error::Result; use crate::io::{FileWrite, OutputFile}; -use crate::spec::avro_util::CompressionSettings; use crate::spec::manifest::_serde::{ManifestEntryV1, ManifestEntryV2}; use crate::spec::manifest::{manifest_schema_v1, manifest_schema_v2}; use crate::spec::{ @@ -54,7 +53,7 @@ pub struct ManifestWriterBuilder { key_metadata: Option>, schema: SchemaRef, partition_spec: PartitionSpec, - compression: CompressionSettings, + compression: Codec, } impl ManifestWriterBuilder { @@ -65,7 +64,7 @@ impl ManifestWriterBuilder { key_metadata: Option>, schema: SchemaRef, partition_spec: PartitionSpec, - compression: CompressionSettings, + compression: Codec, ) -> Self { let location = output.location().to_owned(); Self { @@ -205,7 +204,7 @@ pub struct ManifestWriter { metadata: ManifestMetadata, - compression: CompressionSettings, + compression: Codec, } impl ManifestWriter { @@ -217,7 +216,7 @@ impl ManifestWriter { key_metadata: Option>, metadata: ManifestMetadata, first_row_id: Option, - compression: CompressionSettings, + compression: Codec, ) -> Self { Self { writer_future, @@ -444,9 +443,7 @@ impl ManifestWriter { FormatVersion::V2 | FormatVersion::V3 => manifest_schema_v2(&partition_type)?, }; - let codec = self.compression.to_codec(); - - let mut avro_writer = AvroWriter::with_codec(&avro_schema, Vec::new(), codec); + let mut avro_writer = AvroWriter::with_codec(&avro_schema, Vec::new(), self.compression.clone()); avro_writer.add_user_metadata( "schema".to_string(), to_vec(table_schema).map_err(|err| { @@ -594,6 +591,8 @@ mod tests { use std::fs; use std::sync::Arc; + use apache_avro::DeflateSettings; + use miniz_oxide::deflate::CompressionLevel; use tempfile::TempDir; use super::*; @@ -734,7 +733,7 @@ mod tests { None, metadata.schema.clone(), metadata.partition_spec.clone(), - CompressionSettings::default(), + Codec::Null, ) .build_v2_data(); writer.add_entry(entries[0].clone()).unwrap(); @@ -823,7 +822,7 @@ mod tests { None, schema.clone(), partition_spec.clone(), - CompressionSettings::default(), + Codec::Null, ) .build_v3_deletes(); @@ -871,14 +870,13 @@ mod tests { let uncompressed_path = tmp_dir.path().join("uncompressed_manifest.avro"); let io = FileIOBuilder::new_fs_io().build().unwrap(); let output_file = io.new_output(uncompressed_path.to_str().unwrap()).unwrap(); - let uncompressed_settings = CompressionSettings::new("uncompressed".to_string(), None); let mut writer = ManifestWriterBuilder::new( output_file, Some(1), None, metadata.schema.clone(), metadata.partition_spec.clone(), - uncompressed_settings, + Codec::Null, ) .build_v2_data(); // Add multiple entries with long paths to create compressible data @@ -911,7 +909,7 @@ mod tests { // Write compressed manifest with gzip let compressed_path = tmp_dir.path().join("compressed_manifest.avro"); let output_file = io.new_output(compressed_path.to_str().unwrap()).unwrap(); - let compression = CompressionSettings::new("gzip".to_string(), Some(9)); + let compression = Codec::Deflate(DeflateSettings::new(CompressionLevel::BestCompression)); let mut writer = ManifestWriterBuilder::new( output_file, Some(1), diff --git a/crates/iceberg/src/spec/manifest_list/mod.rs b/crates/iceberg/src/spec/manifest_list/mod.rs index 4899d534ba..d6d606dc8b 100644 --- a/crates/iceberg/src/spec/manifest_list/mod.rs +++ b/crates/iceberg/src/spec/manifest_list/mod.rs @@ -100,7 +100,9 @@ mod test { use super::_serde::ManifestFileV2; use super::*; use crate::io::FileIO; - use crate::spec::{CompressionSettings, Datum, FieldSummary, ManifestContentType, ManifestFile}; + use apache_avro::Codec; + + use crate::spec::{Datum, FieldSummary, ManifestContentType, ManifestFile}; #[tokio::test] async fn test_parse_manifest_list_v1() { @@ -137,7 +139,7 @@ mod test { file_io.new_output(full_path.clone()).unwrap(), 1646658105718557341, Some(1646658105718557341), - CompressionSettings::default(), + Codec::Null, ); writer @@ -211,7 +213,7 @@ mod test { 1646658105718557341, Some(1646658105718557341), 1, - CompressionSettings::default(), + Codec::Null, ); writer @@ -326,7 +328,7 @@ mod test { Some(377075049360453639), 1, Some(10), - CompressionSettings::default(), + Codec::Null, ); writer diff --git a/crates/iceberg/src/spec/manifest_list/writer.rs b/crates/iceberg/src/spec/manifest_list/writer.rs index 947ac32340..963fdce5c9 100644 --- a/crates/iceberg/src/spec/manifest_list/writer.rs +++ b/crates/iceberg/src/spec/manifest_list/writer.rs @@ -17,7 +17,7 @@ use std::collections::HashMap; -use apache_avro::Writer; +use apache_avro::{Codec, Writer}; use bytes::Bytes; use super::_const_schema::{ @@ -27,7 +27,6 @@ use super::_serde::{ManifestFileV1, ManifestFileV2, ManifestFileV3}; use super::{FormatVersion, ManifestContentType, ManifestFile, UNASSIGNED_SEQUENCE_NUMBER}; use crate::error::Result; use crate::io::OutputFile; -use crate::spec::avro_util::CompressionSettings; use crate::{Error, ErrorKind}; /// A manifest list writer. @@ -61,7 +60,7 @@ impl ManifestListWriter { output_file: OutputFile, snapshot_id: i64, parent_snapshot_id: Option, - compression_settings: CompressionSettings, + compression: Codec, ) -> Self { let mut metadata = HashMap::from_iter([ ("snapshot-id".to_string(), snapshot_id.to_string()), @@ -80,7 +79,7 @@ impl ManifestListWriter { 0, snapshot_id, None, - compression_settings, + compression, ) } @@ -90,7 +89,7 @@ impl ManifestListWriter { snapshot_id: i64, parent_snapshot_id: Option, sequence_number: i64, - compression_settings: CompressionSettings, + compression: Codec, ) -> Self { let mut metadata = HashMap::from_iter([ ("snapshot-id".to_string(), snapshot_id.to_string()), @@ -110,7 +109,7 @@ impl ManifestListWriter { sequence_number, snapshot_id, None, - compression_settings, + compression, ) } @@ -121,7 +120,7 @@ impl ManifestListWriter { parent_snapshot_id: Option, sequence_number: i64, first_row_id: Option, // Always None for delete manifests - compression_settings: CompressionSettings, + compression: Codec, ) -> Self { let mut metadata = HashMap::from_iter([ ("snapshot-id".to_string(), snapshot_id.to_string()), @@ -147,7 +146,7 @@ impl ManifestListWriter { sequence_number, snapshot_id, first_row_id, - compression_settings, + compression, ) } @@ -158,15 +157,14 @@ impl ManifestListWriter { sequence_number: i64, snapshot_id: i64, first_row_id: Option, - compression: CompressionSettings, + compression: Codec, ) -> Self { let avro_schema = match format_version { FormatVersion::V1 => &MANIFEST_LIST_AVRO_SCHEMA_V1, FormatVersion::V2 => &MANIFEST_LIST_AVRO_SCHEMA_V2, FormatVersion::V3 => &MANIFEST_LIST_AVRO_SCHEMA_V3, }; - let codec = compression.to_codec(); - let mut avro_writer = Writer::with_codec(avro_schema, Vec::new(), codec); + let mut avro_writer = Writer::with_codec(avro_schema, Vec::new(), compression); for (key, value) in metadata { avro_writer .add_user_metadata(key, value) @@ -336,12 +334,11 @@ mod test { use tempfile::TempDir; + use apache_avro::Codec; + use super::ManifestListWriter; use crate::io::FileIO; - use crate::spec::{ - CompressionSettings, Datum, FieldSummary, ManifestContentType, ManifestFile, ManifestList, - UNASSIGNED_SEQUENCE_NUMBER, - }; + use crate::spec::{Datum, FieldSummary, ManifestContentType, ManifestFile, ManifestList, UNASSIGNED_SEQUENCE_NUMBER}; #[tokio::test] async fn test_manifest_list_writer_v1() { @@ -373,7 +370,7 @@ mod test { let io = FileIO::new_with_fs(); let output_file = output_file(&path, &io); - let mut writer = ManifestListWriter::v1(output_file, 1646658105718557341, Some(0), CompressionSettings::default()); + let mut writer = ManifestListWriter::v1(output_file, 1646658105718557341, Some(0), Codec::Null); writer .add_manifests(expected_manifest_list.entries.clone().into_iter()) .unwrap(); @@ -420,7 +417,7 @@ mod test { let io = FileIO::new_with_fs(); let output_file = output_file(&path, &io); - let mut writer = ManifestListWriter::v2(output_file, snapshot_id, Some(0), seq_num, CompressionSettings::default()); + let mut writer = ManifestListWriter::v2(output_file, snapshot_id, Some(0), seq_num, Codec::Null); writer .add_manifests(expected_manifest_list.entries.clone().into_iter()) .unwrap(); @@ -469,7 +466,7 @@ mod test { let output_file = output_file(&path, &io); let mut writer = - ManifestListWriter::v3(output_file, snapshot_id, Some(0), seq_num, Some(10), CompressionSettings::default()); + ManifestListWriter::v3(output_file, snapshot_id, Some(0), seq_num, Some(10), Codec::Null); writer .add_manifests(expected_manifest_list.entries.clone().into_iter()) .unwrap(); @@ -516,7 +513,7 @@ mod test { let io = FileIO::new_with_fs(); let output_file = output_file(&path, &io); - let mut writer = ManifestListWriter::v1(output_file, 1646658105718557341, Some(0), CompressionSettings::default()); + let mut writer = ManifestListWriter::v1(output_file, 1646658105718557341, Some(0), Codec::Null); writer .add_manifests(expected_manifest_list.entries.clone().into_iter()) .unwrap(); @@ -561,7 +558,7 @@ mod test { let io = FileIO::new_with_fs(); let output_file = output_file(&path, &io); - let mut writer = ManifestListWriter::v1(output_file, 1646658105718557341, Some(0), CompressionSettings::default()); + let mut writer = ManifestListWriter::v1(output_file, 1646658105718557341, Some(0), Codec::Null); writer .add_manifests(expected_manifest_list.entries.clone().into_iter()) .unwrap(); @@ -608,7 +605,7 @@ mod test { let io = FileIO::new_with_fs(); let output_file = output_file(&path, &io); - let mut writer = ManifestListWriter::v2(output_file, snapshot_id, Some(0), seq_num, CompressionSettings::default()); + let mut writer = ManifestListWriter::v2(output_file, snapshot_id, Some(0), seq_num, Codec::Null); writer .add_manifests(expected_manifest_list.entries.clone().into_iter()) .unwrap(); diff --git a/crates/iceberg/src/spec/mod.rs b/crates/iceberg/src/spec/mod.rs index 60ad8bef76..0b3f3124e9 100644 --- a/crates/iceberg/src/spec/mod.rs +++ b/crates/iceberg/src/spec/mod.rs @@ -17,7 +17,7 @@ //! Spec for Iceberg. -mod avro_util; +pub(crate) mod avro_util; mod datatypes; mod encrypted_key; mod manifest; @@ -38,7 +38,6 @@ mod view_metadata; mod view_metadata_builder; mod view_version; -pub use avro_util::CompressionSettings; pub use datatypes::*; pub use encrypted_key::*; pub use manifest::*; diff --git a/crates/iceberg/src/spec/table_properties.rs b/crates/iceberg/src/spec/table_properties.rs index dd3484df8a..cfccd61c27 100644 --- a/crates/iceberg/src/spec/table_properties.rs +++ b/crates/iceberg/src/spec/table_properties.rs @@ -911,32 +911,23 @@ mod tests { } #[test] - fn test_table_properties_optional_compression_level() { - // Test that compression level is None when not specified - let props = HashMap::new(); - let table_properties = TableProperties::try_from(&props).unwrap(); - assert_eq!(table_properties.avro_compression_level, None); - - // Test that compression level is Some(value) when specified - let props = HashMap::from([( - TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL.to_string(), - "5".to_string(), - )]); + fn test_table_properties_compression_with_level() { + // Test that compression level is used when specified + let props = HashMap::from([ + ( + TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC.to_string(), + "gzip".to_string(), + ), + ( + TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL.to_string(), + "5".to_string(), + ), + ]); let table_properties = TableProperties::try_from(&props).unwrap(); - assert_eq!(table_properties.avro_compression_level, Some(5)); - - // Test that invalid compression level returns error - let props = HashMap::from([( - TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL.to_string(), - "invalid".to_string(), - )]); - let result = TableProperties::try_from(&props); - assert!(result.is_err()); - assert!( - result - .unwrap_err() - .to_string() - .contains("Invalid value for write.avro.compression-level") - ); + // Check that it parsed to a Deflate codec + assert!(matches!( + table_properties.avro_compression_codec, + Codec::Deflate(_) + )); } } diff --git a/crates/iceberg/src/transaction/snapshot.rs b/crates/iceberg/src/transaction/snapshot.rs index 34b9df4774..24a5b19a45 100644 --- a/crates/iceberg/src/transaction/snapshot.rs +++ b/crates/iceberg/src/transaction/snapshot.rs @@ -24,11 +24,12 @@ use futures::stream::FuturesUnordered; use uuid::Uuid; use crate::error::Result; +use crate::spec::avro_util::codec_from_str; use crate::spec::{ - CompressionSettings, DataFile, DataFileFormat, FormatVersion, MAIN_BRANCH, ManifestContentType, - ManifestEntry, ManifestFile, ManifestListWriter, ManifestWriter, ManifestWriterBuilder, - Operation, Snapshot, SnapshotReference, SnapshotRetention, SnapshotSummaryCollector, Struct, - StructType, Summary, TableProperties, update_snapshot_summaries, + DataFile, DataFileFormat, FormatVersion, MAIN_BRANCH, ManifestContentType, ManifestEntry, + ManifestFile, ManifestListWriter, ManifestWriter, ManifestWriterBuilder, Operation, Snapshot, + SnapshotReference, SnapshotRetention, SnapshotSummaryCollector, Struct, StructType, Summary, + TableProperties, update_snapshot_summaries, }; use crate::table::Table; use crate::transaction::ActionCommit; @@ -262,10 +263,6 @@ impl<'a> SnapshotProducer<'a> { ) .with_source(e) })?; - let compression = CompressionSettings::new( - table_props.avro_compression_codec, - table_props.avro_compression_level, - ); let builder = ManifestWriterBuilder::new( output_file, @@ -277,7 +274,7 @@ impl<'a> SnapshotProducer<'a> { .default_partition_spec() .as_ref() .clone(), - compression, + codec_from_str(Some(&table_props.avro_compression_codec), table_props.avro_compression_level), ); match self.table.metadata().format_version() { @@ -469,24 +466,22 @@ impl<'a> SnapshotProducer<'a> { ) .with_source(e) })?; - let compression = CompressionSettings::new( - table_props.avro_compression_codec, - table_props.avro_compression_level, - ); + + let compression = codec_from_str(Some(&table_props.avro_compression_codec), table_props.avro_compression_level); let mut manifest_list_writer = match self.table.metadata().format_version() { FormatVersion::V1 => ManifestListWriter::v1( output_file, self.snapshot_id, self.table.metadata().current_snapshot_id(), - compression, + compression.clone(), ), FormatVersion::V2 => ManifestListWriter::v2( output_file, self.snapshot_id, self.table.metadata().current_snapshot_id(), next_seq_num, - compression, + compression.clone(), ), FormatVersion::V3 => ManifestListWriter::v3( output_file, @@ -494,7 +489,7 @@ impl<'a> SnapshotProducer<'a> { self.table.metadata().current_snapshot_id(), next_seq_num, Some(first_row_id), - compression, + compression.clone(), ), }; From 1eb96192d62756f113a9bea69cd889459d0874a1 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Tue, 16 Dec 2025 04:54:33 +0000 Subject: [PATCH 16/33] fmt --- crates/iceberg/src/io/object_cache.rs | 3 +-- crates/iceberg/src/scan/mod.rs | 2 +- crates/iceberg/src/spec/manifest/mod.rs | 2 -- crates/iceberg/src/spec/manifest/writer.rs | 3 ++- 4 files changed, 4 insertions(+), 6 deletions(-) diff --git a/crates/iceberg/src/io/object_cache.rs b/crates/iceberg/src/io/object_cache.rs index f7b87cd317..017dec048b 100644 --- a/crates/iceberg/src/io/object_cache.rs +++ b/crates/iceberg/src/io/object_cache.rs @@ -207,13 +207,12 @@ impl ObjectCache { mod tests { use std::fs; + use apache_avro::Codec; use minijinja::value::Value; use minijinja::{AutoEscape, Environment, context}; use tempfile::TempDir; use uuid::Uuid; - use apache_avro::Codec; - use super::*; use crate::TableIdent; use crate::io::{FileIO, OutputFile}; diff --git a/crates/iceberg/src/scan/mod.rs b/crates/iceberg/src/scan/mod.rs index 9db2341ca8..81ce8320a5 100644 --- a/crates/iceberg/src/scan/mod.rs +++ b/crates/iceberg/src/scan/mod.rs @@ -624,6 +624,7 @@ pub mod tests { use std::fs::File; use std::sync::Arc; + use apache_avro::Codec; use arrow_array::cast::AsArray; use arrow_array::{ Array, ArrayRef, BooleanArray, Float64Array, Int32Array, Int64Array, RecordBatch, @@ -634,7 +635,6 @@ pub mod tests { use minijinja::{AutoEscape, Environment, context}; use parquet::arrow::{ArrowWriter, PARQUET_FIELD_ID_META_KEY}; use parquet::basic::Compression; - use apache_avro::Codec; use parquet::file::properties::WriterProperties; use tempfile::TempDir; use uuid::Uuid; diff --git a/crates/iceberg/src/spec/manifest/mod.rs b/crates/iceberg/src/spec/manifest/mod.rs index 4f00153762..d360e3962b 100644 --- a/crates/iceberg/src/spec/manifest/mod.rs +++ b/crates/iceberg/src/spec/manifest/mod.rs @@ -165,8 +165,6 @@ mod tests { use serde_json::{Value, to_vec}; use tempfile::TempDir; - use apache_avro::Codec; - use super::*; use crate::io::FileIO; use crate::spec::{Literal, NestedField, PrimitiveType, Struct, Transform, Type}; diff --git a/crates/iceberg/src/spec/manifest/writer.rs b/crates/iceberg/src/spec/manifest/writer.rs index d541d74acf..c2b6191f38 100644 --- a/crates/iceberg/src/spec/manifest/writer.rs +++ b/crates/iceberg/src/spec/manifest/writer.rs @@ -443,7 +443,8 @@ impl ManifestWriter { FormatVersion::V2 | FormatVersion::V3 => manifest_schema_v2(&partition_type)?, }; - let mut avro_writer = AvroWriter::with_codec(&avro_schema, Vec::new(), self.compression.clone()); + let mut avro_writer = + AvroWriter::with_codec(&avro_schema, Vec::new(), self.compression.clone()); avro_writer.add_user_metadata( "schema".to_string(), to_vec(table_schema).map_err(|err| { From 0d519c37439e546eec200dd65843eff55806ecdb Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Tue, 16 Dec 2025 05:41:00 +0000 Subject: [PATCH 17/33] remove unneeded tests --- crates/iceberg/src/spec/avro_util.rs | 124 ++------------------ crates/iceberg/src/spec/table_properties.rs | 21 ---- 2 files changed, 7 insertions(+), 138 deletions(-) diff --git a/crates/iceberg/src/spec/avro_util.rs b/crates/iceberg/src/spec/avro_util.rs index dd7d0dce65..700d855c8f 100644 --- a/crates/iceberg/src/spec/avro_util.rs +++ b/crates/iceberg/src/spec/avro_util.rs @@ -96,10 +96,7 @@ pub fn codec_from_str(codec: Option<&str>, level: Option) -> Codec { #[cfg(test)] mod tests { - use std::collections::HashMap; - - use apache_avro::types::Record; - use apache_avro::{DeflateSettings, Schema, Writer, ZstandardSettings}; + use apache_avro::{DeflateSettings, ZstandardSettings}; use miniz_oxide::deflate::CompressionLevel; use super::*; @@ -126,6 +123,12 @@ mod tests { assert_eq!(codec, Codec::Zstandard(ZstandardSettings::new(3))); } + #[test] + fn test_codec_from_str_zstd_clamping() { + let codec = codec_from_str(Some("zstd"), Some(23)); + assert_eq!(codec, Codec::Zstandard(ZstandardSettings::new(22))); + } + #[test] fn test_codec_from_str_uncompressed() { let codec = codec_from_str(Some("uncompressed"), None); @@ -160,117 +163,4 @@ mod tests { let codec = codec_from_str(Some("zstd"), None); assert_eq!(codec, Codec::Zstandard(ZstandardSettings::new(1))); } - - #[test] - fn test_codec_from_str_deflate_levels() { - // Create a simple schema for testing - let schema = Schema::parse_str(r#"{"type": "record", "name": "test", "fields": [{"name": "field", "type": "string"}]}"#).unwrap(); - - // Create test data - let test_str = "test data that should compress differently at different levels. This is a longer string to ensure compression has something to work with. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog."; - - // Test that different compression levels produce different output sizes - let mut sizes = HashMap::new(); - for level in [0, 1, 5, 9, 10] { - let codec = codec_from_str(Some("gzip"), Some(level)); - let mut writer = Writer::with_codec(&schema, Vec::new(), codec); - - let mut record = Record::new(&schema).unwrap(); - record.put("field", test_str); - writer.append(record).unwrap(); - - let encoded = writer.into_inner().unwrap(); - sizes.insert(level, encoded.len()); - } - - // Level 0 (NoCompression) should be largest - // Level 10 (UberCompression) should be smallest or equal to level 9 - assert!(sizes[&0] >= sizes[&1], "Level 0 should be >= level 1"); - assert!( - sizes[&1] >= sizes[&9] || sizes[&1] == sizes[&9], - "Level 1 should be >= level 9" - ); - assert!( - sizes[&9] >= sizes[&10] || sizes[&9] == sizes[&10], - "Level 9 should be >= level 10" - ); - } - - #[test] - fn test_codec_from_str_zstd_levels() { - // Create a simple schema for testing - let schema = Schema::parse_str(r#"{"type": "record", "name": "test", "fields": [{"name": "field", "type": "string"}]}"#).unwrap(); - let test_str = "test data that should compress differently at different levels. This is a longer string to ensure compression has something to work with. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog."; - - // Test various levels by checking they produce valid codecs - for level in [0, 3, 15, 22] { - let codec = codec_from_str(Some("zstd"), Some(level)); - assert!(matches!(codec, Codec::Zstandard(_))); - - // Verify the codec actually works by compressing data - let mut writer = Writer::with_codec(&schema, Vec::new(), codec); - let mut record = Record::new(&schema).unwrap(); - record.put("field", test_str); - writer.append(record).unwrap(); - - let encoded = writer.into_inner().unwrap(); - assert!(!encoded.is_empty(), "Compression should produce output"); - } - - // Test clamping - higher than 22 should be clamped to 22 - let codec_100 = codec_from_str(Some("zstd"), Some(100)); - let codec_22 = codec_from_str(Some("zstd"), Some(22)); - - // Both should work and produce similar results - let mut writer_100 = Writer::with_codec(&schema, Vec::new(), codec_100); - let mut record_100 = Record::new(&schema).unwrap(); - record_100.put("field", test_str); - writer_100.append(record_100).unwrap(); - let encoded_100 = writer_100.into_inner().unwrap(); - - let mut writer_22 = Writer::with_codec(&schema, Vec::new(), codec_22); - let mut record_22 = Record::new(&schema).unwrap(); - record_22.put("field", test_str); - writer_22.append(record_22).unwrap(); - let encoded_22 = writer_22.into_inner().unwrap(); - - // Both should produce the same size since 100 is clamped to 22 - assert_eq!( - encoded_100.len(), - encoded_22.len(), - "Level 100 should be clamped to 22" - ); - } - - #[test] - fn test_compression_level_differences() { - // Create a schema and data that will compress well - let schema = Schema::parse_str(r#"{"type": "record", "name": "test", "fields": [{"name": "field", "type": "string"}]}"#).unwrap(); - - // Use highly compressible data - let test_str = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; - - // Test gzip level 0 (no compression) vs level 9 (best compression) - let codec_0 = codec_from_str(Some("gzip"), Some(0)); - let mut writer_0 = Writer::with_codec(&schema, Vec::new(), codec_0); - let mut record_0 = Record::new(&schema).unwrap(); - record_0.put("field", test_str); - writer_0.append(record_0).unwrap(); - let size_0 = writer_0.into_inner().unwrap().len(); - - let codec_9 = codec_from_str(Some("gzip"), Some(9)); - let mut writer_9 = Writer::with_codec(&schema, Vec::new(), codec_9); - let mut record_9 = Record::new(&schema).unwrap(); - record_9.put("field", test_str); - writer_9.append(record_9).unwrap(); - let size_9 = writer_9.into_inner().unwrap().len(); - - // Level 0 should produce larger output than level 9 for compressible data - assert!( - size_0 > size_9, - "NoCompression (level 0) should produce larger output than BestCompression (level 9): {} vs {}", - size_0, - size_9 - ); - } } diff --git a/crates/iceberg/src/spec/table_properties.rs b/crates/iceberg/src/spec/table_properties.rs index cfccd61c27..adb8222077 100644 --- a/crates/iceberg/src/spec/table_properties.rs +++ b/crates/iceberg/src/spec/table_properties.rs @@ -909,25 +909,4 @@ mod tests { let tp = TableProperties::try_from(&props).unwrap(); assert!(!tp.cdc_enabled); } - - #[test] - fn test_table_properties_compression_with_level() { - // Test that compression level is used when specified - let props = HashMap::from([ - ( - TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC.to_string(), - "gzip".to_string(), - ), - ( - TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL.to_string(), - "5".to_string(), - ), - ]); - let table_properties = TableProperties::try_from(&props).unwrap(); - // Check that it parsed to a Deflate codec - assert!(matches!( - table_properties.avro_compression_codec, - Codec::Deflate(_) - )); - } } From 99bb4e7992c733a34a7a1ccf1a2c69c7940dbd77 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Tue, 16 Dec 2025 05:43:34 +0000 Subject: [PATCH 18/33] fix package import --- Cargo.toml | 1 + crates/iceberg/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index f11112109a..bfd36642f4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -111,6 +111,7 @@ log = "0.4.28" metainfo = "0.7.14" mimalloc = "0.1.46" minijinja = "2.12.0" +miniz_oxide = "0.8" mockall = "0.13.1" mockito = "1" motore-macros = "0.4.3" diff --git a/crates/iceberg/Cargo.toml b/crates/iceberg/Cargo.toml index afb961d9dc..cca9ddcbb3 100644 --- a/crates/iceberg/Cargo.toml +++ b/crates/iceberg/Cargo.toml @@ -60,7 +60,7 @@ fnv = { workspace = true } futures = { workspace = true } itertools = { workspace = true } log = { workspace = true } -miniz_oxide = "0.8" +miniz_oxide = { workspace = true } moka = { version = "0.12.10", features = ["future"] } murmur3 = { workspace = true } once_cell = { workspace = true } From e869d66d07f3e1fc8ba836ea1ee2c5aeeddca09f Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Tue, 16 Dec 2025 05:56:22 +0000 Subject: [PATCH 19/33] clippy and visibility --- crates/iceberg/src/spec/avro_util.rs | 20 ++++++++++++-------- crates/iceberg/src/spec/manifest/writer.rs | 3 +-- crates/iceberg/src/spec/table_properties.rs | 3 +++ crates/iceberg/src/transaction/snapshot.rs | 6 +++--- 4 files changed, 19 insertions(+), 13 deletions(-) diff --git a/crates/iceberg/src/spec/avro_util.rs b/crates/iceberg/src/spec/avro_util.rs index 700d855c8f..41a7fbad01 100644 --- a/crates/iceberg/src/spec/avro_util.rs +++ b/crates/iceberg/src/spec/avro_util.rs @@ -31,9 +31,11 @@ pub const CODEC_SNAPPY: &str = "snappy"; pub const CODEC_UNCOMPRESSED: &str = "uncompressed"; /// Default compression level for gzip (matches Java implementation) -pub const DEFAULT_GZIP_LEVEL: u8 = 9; +const DEFAULT_GZIP_LEVEL: u8 = 9; /// Default compression level for zstd (matches Java implementation) -pub const DEFAULT_ZSTD_LEVEL: u8 = 1; +const DEFAULT_ZSTD_LEVEL: u8 = 1; +/// Max supported level for ZSTD +const MAX_ZSTD_LEVEL: u8 = 22; /// Convert codec name and level to apache_avro::Codec. /// Returns Codec::Null for unknown or unsupported codecs. @@ -48,6 +50,7 @@ pub const DEFAULT_ZSTD_LEVEL: u8 = 1; /// - 10: UberCompression /// - 6: DefaultLevel (balanced speed/compression) /// - Other values: DefaultLevel +/// /// For zstd, level is clamped to valid range (0-22). /// When `None`, uses codec-specific defaults. /// @@ -58,9 +61,7 @@ pub const DEFAULT_ZSTD_LEVEL: u8 = 1; /// - `snappy`: Uses Snappy compression (level parameter ignored) /// - `uncompressed` or `None`: No compression /// - Any other value: Defaults to no compression (Codec::Null) -/// Convert codec name and level to apache_avro::Codec. -/// This is public so it can be used when parsing table properties. -pub fn codec_from_str(codec: Option<&str>, level: Option) -> Codec { +pub(crate) fn codec_from_str(codec: Option<&str>, level: Option) -> Codec { // Use case-insensitive comparison to match Java implementation match codec.map(|s| s.to_lowercase()).as_deref() { Some(c) if c == CODEC_GZIP => { @@ -78,7 +79,7 @@ pub fn codec_from_str(codec: Option<&str>, level: Option) -> Codec { } Some(c) if c == CODEC_ZSTD => { // Zstandard supports levels 0-22, clamp to valid range - let zstd_level = level.unwrap_or(DEFAULT_ZSTD_LEVEL).min(22); + let zstd_level = level.unwrap_or(DEFAULT_ZSTD_LEVEL).min(MAX_ZSTD_LEVEL); Codec::Zstandard(ZstandardSettings::new(zstd_level)) } Some(c) if c == CODEC_SNAPPY => Codec::Snappy, @@ -125,8 +126,11 @@ mod tests { #[test] fn test_codec_from_str_zstd_clamping() { - let codec = codec_from_str(Some("zstd"), Some(23)); - assert_eq!(codec, Codec::Zstandard(ZstandardSettings::new(22))); + let codec = codec_from_str(Some("zstd"), Some(MAX_ZSTD_LEVEL + 1)); + assert_eq!( + codec, + Codec::Zstandard(ZstandardSettings::new(MAX_ZSTD_LEVEL)) + ); } #[test] diff --git a/crates/iceberg/src/spec/manifest/writer.rs b/crates/iceberg/src/spec/manifest/writer.rs index c2b6191f38..307966d591 100644 --- a/crates/iceberg/src/spec/manifest/writer.rs +++ b/crates/iceberg/src/spec/manifest/writer.rs @@ -443,8 +443,7 @@ impl ManifestWriter { FormatVersion::V2 | FormatVersion::V3 => manifest_schema_v2(&partition_type)?, }; - let mut avro_writer = - AvroWriter::with_codec(&avro_schema, Vec::new(), self.compression.clone()); + let mut avro_writer = AvroWriter::with_codec(&avro_schema, Vec::new(), self.compression); avro_writer.add_user_metadata( "schema".to_string(), to_vec(table_schema).map_err(|err| { diff --git a/crates/iceberg/src/spec/table_properties.rs b/crates/iceberg/src/spec/table_properties.rs index adb8222077..8046fbf450 100644 --- a/crates/iceberg/src/spec/table_properties.rs +++ b/crates/iceberg/src/spec/table_properties.rs @@ -419,6 +419,9 @@ impl TryFrom<&HashMap> for TableProperties { #[cfg(test)] mod tests { + use apache_avro::{DeflateSettings, ZstandardSettings}; + use miniz_oxide::deflate::CompressionLevel; + use super::*; use crate::compression::CompressionCodec; diff --git a/crates/iceberg/src/transaction/snapshot.rs b/crates/iceberg/src/transaction/snapshot.rs index 24a5b19a45..f139dd5dd5 100644 --- a/crates/iceberg/src/transaction/snapshot.rs +++ b/crates/iceberg/src/transaction/snapshot.rs @@ -474,14 +474,14 @@ impl<'a> SnapshotProducer<'a> { output_file, self.snapshot_id, self.table.metadata().current_snapshot_id(), - compression.clone(), + compression, ), FormatVersion::V2 => ManifestListWriter::v2( output_file, self.snapshot_id, self.table.metadata().current_snapshot_id(), next_seq_num, - compression.clone(), + compression, ), FormatVersion::V3 => ManifestListWriter::v3( output_file, @@ -489,7 +489,7 @@ impl<'a> SnapshotProducer<'a> { self.table.metadata().current_snapshot_id(), next_seq_num, Some(first_row_id), - compression.clone(), + compression, ), }; From eea31b42eb66d39b01477dbc5f6dcb081bbf8f6c Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Tue, 16 Dec 2025 22:39:55 +0000 Subject: [PATCH 20/33] add missing args --- crates/iceberg/src/scan/mod.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/crates/iceberg/src/scan/mod.rs b/crates/iceberg/src/scan/mod.rs index 81ce8320a5..7c6534182d 100644 --- a/crates/iceberg/src/scan/mod.rs +++ b/crates/iceberg/src/scan/mod.rs @@ -1194,6 +1194,7 @@ pub mod tests { None, current_schema.clone(), current_partition_spec.as_ref().clone(), + Codec::Null, ) .build_v2_data(); @@ -1229,6 +1230,7 @@ pub mod tests { None, current_schema.clone(), current_partition_spec.as_ref().clone(), + Codec::Null, ) .build_v2_deletes(); @@ -1268,6 +1270,7 @@ pub mod tests { current_snapshot.snapshot_id(), current_snapshot.parent_snapshot_id(), current_snapshot.sequence_number(), + Codec::Null, ); manifest_list_write .add_manifests(vec![data_manifest, delete_manifest].into_iter()) From 1f4d485963e48cf723881fef29d004bba1db373c Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Tue, 16 Dec 2025 22:54:56 +0000 Subject: [PATCH 21/33] address clippy --- crates/iceberg/src/spec/avro_util.rs | 3 +-- crates/iceberg/src/spec/manifest/writer.rs | 10 +++------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/crates/iceberg/src/spec/avro_util.rs b/crates/iceberg/src/spec/avro_util.rs index 41a7fbad01..3b45db15b3 100644 --- a/crates/iceberg/src/spec/avro_util.rs +++ b/crates/iceberg/src/spec/avro_util.rs @@ -87,8 +87,7 @@ pub(crate) fn codec_from_str(codec: Option<&str>, level: Option) -> Codec { None => Codec::Null, Some(unknown) => { warn!( - "Unrecognized compression codec '{}', using no compression (Codec::Null)", - unknown + "Unrecognized compression codec '{unknown}', using no compression (Codec::Null)" ); Codec::Null } diff --git a/crates/iceberg/src/spec/manifest/writer.rs b/crates/iceberg/src/spec/manifest/writer.rs index 307966d591..ffcfc00bc4 100644 --- a/crates/iceberg/src/spec/manifest/writer.rs +++ b/crates/iceberg/src/spec/manifest/writer.rs @@ -884,8 +884,7 @@ mod tests { let data_file = DataFileBuilder::default() .content(DataContentType::Data) .file_path(format!( - "/very/long/path/to/data/directory/with/many/subdirectories/file_{}.parquet", - i + "/very/long/path/to/data/directory/with/many/subdirectories/file_{i}.parquet" )) .file_format(DataFileFormat::Parquet) .partition(Struct::empty()) @@ -924,8 +923,7 @@ mod tests { let data_file = DataFileBuilder::default() .content(DataContentType::Data) .file_path(format!( - "/very/long/path/to/data/directory/with/many/subdirectories/file_{}.parquet", - i + "/very/long/path/to/data/directory/with/many/subdirectories/file_{i}.parquet" )) .file_format(DataFileFormat::Parquet) .partition(Struct::empty()) @@ -949,9 +947,7 @@ mod tests { // Verify compression is actually working assert!( compressed_size < uncompressed_size, - "Compressed size ({}) should be less than uncompressed size ({})", - compressed_size, - uncompressed_size + "Compressed size ({compressed_size}) should be less than uncompressed size ({uncompressed_size})" ); // Verify the compressed file can be read back correctly From bb68082db46f82cce2ba760c12e90e1ad0e640f8 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Tue, 16 Dec 2025 23:02:45 +0000 Subject: [PATCH 22/33] Fmt --- crates/iceberg/src/spec/avro_util.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/crates/iceberg/src/spec/avro_util.rs b/crates/iceberg/src/spec/avro_util.rs index 3b45db15b3..86166129f2 100644 --- a/crates/iceberg/src/spec/avro_util.rs +++ b/crates/iceberg/src/spec/avro_util.rs @@ -86,9 +86,7 @@ pub(crate) fn codec_from_str(codec: Option<&str>, level: Option) -> Codec { Some(c) if c == CODEC_UNCOMPRESSED => Codec::Null, None => Codec::Null, Some(unknown) => { - warn!( - "Unrecognized compression codec '{unknown}', using no compression (Codec::Null)" - ); + warn!("Unrecognized compression codec '{unknown}', using no compression (Codec::Null)"); Codec::Null } } From 313ebe1e229136ab753509ec3d1d70f2d63c619e Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Tue, 16 Dec 2025 23:30:15 +0000 Subject: [PATCH 23/33] move use statements to top level --- crates/iceberg/src/scan/mod.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/crates/iceberg/src/scan/mod.rs b/crates/iceberg/src/scan/mod.rs index 7c6534182d..c036131814 100644 --- a/crates/iceberg/src/scan/mod.rs +++ b/crates/iceberg/src/scan/mod.rs @@ -619,7 +619,7 @@ pub mod tests { //! shared tests for the table scan API #![allow(missing_docs)] - use std::collections::HashMap; + use std::collections::{HashMap, HashSet}; use std::fs; use std::fs::File; use std::sync::Arc; @@ -2034,8 +2034,6 @@ pub mod tests { #[tokio::test] async fn test_select_with_file_column() { - use arrow_array::cast::AsArray; - let mut fixture = TableTestFixture::new(); fixture.setup_manifest_files().await; @@ -2157,8 +2155,6 @@ pub mod tests { #[tokio::test] async fn test_file_column_with_multiple_files() { - use std::collections::HashSet; - let mut fixture = TableTestFixture::new(); fixture.setup_manifest_files().await; From 0b9e496f6dfc127bfccd2dc9c797afd54056f040 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Thu, 19 Mar 2026 22:18:57 +0000 Subject: [PATCH 24/33] update to use compression codec --- .../iceberg/src/catalog/metadata_location.rs | 18 +- crates/iceberg/src/compression.rs | 97 ++++------ crates/iceberg/src/io/object_cache.rs | 6 +- crates/iceberg/src/puffin/metadata.rs | 2 +- crates/iceberg/src/puffin/mod.rs | 13 +- crates/iceberg/src/puffin/reader.rs | 2 +- crates/iceberg/src/puffin/test_utils.rs | 4 +- crates/iceberg/src/puffin/writer.rs | 8 +- crates/iceberg/src/scan/mod.rs | 16 +- crates/iceberg/src/spec/avro_util.rs | 175 +++++++++++------- crates/iceberg/src/spec/manifest/mod.rs | 13 +- crates/iceberg/src/spec/manifest/writer.rs | 33 ++-- crates/iceberg/src/spec/table_metadata.rs | 4 +- crates/iceberg/src/spec/table_properties.rs | 95 +++++----- crates/iceberg/src/transaction/snapshot.rs | 6 +- 15 files changed, 249 insertions(+), 243 deletions(-) diff --git a/crates/iceberg/src/catalog/metadata_location.rs b/crates/iceberg/src/catalog/metadata_location.rs index acd041d5e1..73333c4f42 100644 --- a/crates/iceberg/src/catalog/metadata_location.rs +++ b/crates/iceberg/src/catalog/metadata_location.rs @@ -114,9 +114,9 @@ impl MetadataLocation { ))?; // Check for compression suffix (e.g., .gz) - let gzip_suffix = CompressionCodec::gzip_default().suffix()?; + let gzip_suffix = CompressionCodec::Gzip(None).suffix()?; let (stripped, compression_codec) = if let Some(s) = stripped.strip_suffix(gzip_suffix) { - (s, CompressionCodec::gzip_default()) + (s, CompressionCodec::Gzip(None)) } else { (stripped, CompressionCodec::None) }; @@ -261,7 +261,7 @@ mod test { table_location: "/abc".to_string(), version: 1234567, id: Uuid::from_str("2cd22b57-5127-4198-92ba-e4e67c79821b").unwrap(), - compression_codec: CompressionCodec::gzip_default(), + compression_codec: CompressionCodec::Gzip(None), }), ), // Negative version @@ -347,14 +347,11 @@ mod test { .unwrap(); assert_eq!( location_gzip.compression_codec, - CompressionCodec::gzip_default() + CompressionCodec::Gzip(None) ); let next_gzip = location_gzip.with_next_version(); - assert_eq!( - next_gzip.compression_codec, - CompressionCodec::gzip_default() - ); + assert_eq!(next_gzip.compression_codec, CompressionCodec::Gzip(None)); assert_eq!(next_gzip.version, 6); } @@ -375,10 +372,7 @@ mod test { ); let metadata_gzip = create_test_metadata(props_gzip); let updated_gzip = location.with_new_metadata(&metadata_gzip); - assert_eq!( - updated_gzip.compression_codec, - CompressionCodec::gzip_default() - ); + assert_eq!(updated_gzip.compression_codec, CompressionCodec::Gzip(None)); assert_eq!(updated_gzip.version, 0); assert_eq!( updated_gzip.to_string(), diff --git a/crates/iceberg/src/compression.rs b/crates/iceberg/src/compression.rs index 929d9226e7..9d1bac752f 100644 --- a/crates/iceberg/src/compression.rs +++ b/crates/iceberg/src/compression.rs @@ -27,11 +27,8 @@ use serde::{Deserialize, Deserializer, Serialize, Serializer}; use crate::{Error, ErrorKind, Result}; -/// Default compression level for Zstandard (zstd). const ZSTD_DEFAULT_LEVEL: u8 = 3; -/// Default compression level for Gzip. const GZIP_DEFAULT_LEVEL: u8 = 6; -/// Maximum compression level for Gzip. const GZIP_MAX_LEVEL: u8 = 9; /// Data compression formats @@ -42,47 +39,24 @@ pub enum CompressionCodec { None, /// LZ4 single compression frame with content size present Lz4, - /// Zstandard single compression frame with content size present. - /// Level range is 0–22, where 0 means default compression level (not no compression). - /// Use [`CompressionCodec::zstd_default`] to construct with the default level. - Zstd(u8), - /// Gzip compression. Level range is 0–9, where 0 means no compression. - /// Use [`CompressionCodec::gzip_default`] to construct with the default level. - Gzip(u8), + /// Zstandard single compression frame with content size present. Optional level 0–22. + Zstd(Option), + /// Gzip compression. Optional level 0–9. + Gzip(Option), /// Snappy compression Snappy, } -impl CompressionCodec { - /// Returns a Zstd codec with the default compression level. - pub const fn zstd_default() -> Self { - CompressionCodec::Zstd(ZSTD_DEFAULT_LEVEL) - } - - /// Returns a Gzip codec with the default compression level. - pub const fn gzip_default() -> Self { - CompressionCodec::Gzip(GZIP_DEFAULT_LEVEL) - } - - /// Returns the codec name as used in serialization and error messages. - pub fn name(&self) -> &'static str { - match self { +impl Serialize for CompressionCodec { + fn serialize(&self, serializer: S) -> std::result::Result { + let s = match self { CompressionCodec::None => "none", CompressionCodec::Lz4 => "lz4", CompressionCodec::Zstd(_) => "zstd", CompressionCodec::Gzip(_) => "gzip", CompressionCodec::Snappy => "snappy", - } - } -} - -// Note: serialize/deserialize do not round-trip the compression level. Iceberg configuration -// only the codec name (e.g. "zstd"), not the level, so deserialization always produces the -// default level. A `Zstd(5)` written to metadata will be read back as `Zstd(3)`. Some -// compression configuration (e.g. Avro metadata) has a separate level field alongside the codec name. -impl Serialize for CompressionCodec { - fn serialize(&self, serializer: S) -> std::result::Result { - serializer.serialize_str(self.name()) + }; + serializer.serialize_str(s) } } @@ -92,8 +66,8 @@ impl<'de> Deserialize<'de> for CompressionCodec { match s.to_lowercase().as_str() { "none" => Ok(CompressionCodec::None), "lz4" => Ok(CompressionCodec::Lz4), - "zstd" => Ok(CompressionCodec::zstd_default()), - "gzip" => Ok(CompressionCodec::gzip_default()), + "zstd" => Ok(CompressionCodec::Zstd(None)), + "gzip" => Ok(CompressionCodec::Gzip(None)), "snappy" => Ok(CompressionCodec::Snappy), other => Err(serde::de::Error::unknown_variant(other, &[ "none", "lz4", "zstd", "gzip", "snappy", @@ -107,8 +81,10 @@ impl fmt::Display for CompressionCodec { match self { CompressionCodec::None => write!(f, "None"), CompressionCodec::Lz4 => write!(f, "Lz4"), - CompressionCodec::Zstd(level) => write!(f, "Zstd(level={level})"), - CompressionCodec::Gzip(level) => write!(f, "Gzip(level={level})"), + CompressionCodec::Zstd(Some(level)) => write!(f, "Zstd(level={level})"), + CompressionCodec::Zstd(None) => write!(f, "Zstd"), + CompressionCodec::Gzip(Some(level)) => write!(f, "Gzip(level={level})"), + CompressionCodec::Gzip(None) => write!(f, "Gzip"), CompressionCodec::Snappy => write!(f, "Snappy"), } } @@ -145,14 +121,16 @@ impl CompressionCodec { )), CompressionCodec::Zstd(level) => { let writer = Vec::::new(); - let mut encoder = zstd::stream::Encoder::new(writer, *level as i32)?; + let mut encoder = + zstd::stream::Encoder::new(writer, level.unwrap_or(ZSTD_DEFAULT_LEVEL) as i32)?; encoder.include_checksum(true)?; encoder.set_pledged_src_size(Some(bytes.len().try_into()?))?; std::io::copy(&mut &bytes[..], &mut encoder)?; Ok(encoder.finish()?) } CompressionCodec::Gzip(level) => { - let compression = Compression::new((*level).min(GZIP_MAX_LEVEL) as u32); + let compression = + Compression::new(level.unwrap_or(GZIP_DEFAULT_LEVEL).min(GZIP_MAX_LEVEL) as u32); let mut encoder = GzEncoder::new(Vec::new(), compression); encoder.write_all(&bytes)?; Ok(encoder.finish()?) @@ -164,6 +142,16 @@ impl CompressionCodec { } } + /// Replace the compression level, if this variant carries one. + /// Variants without a level (`None`, `Lz4`, `Snappy`) are returned unchanged. + pub(crate) fn with_level(self, level: Option) -> Self { + match self { + CompressionCodec::Gzip(_) => CompressionCodec::Gzip(level), + CompressionCodec::Zstd(_) => CompressionCodec::Zstd(level), + other => other, + } + } + pub(crate) fn is_none(&self) -> bool { matches!(self, CompressionCodec::None) } @@ -173,7 +161,7 @@ impl CompressionCodec { /// /// # Errors /// - /// Returns an error for Lz4 and Zstd as they are not fully supported. + /// Returns an error for Lz4, Zstd, and Snappy as they are not fully supported. pub fn suffix(&self) -> Result<&'static str> { match self { CompressionCodec::None => Ok(""), @@ -207,10 +195,7 @@ mod tests { async fn test_compression_codec_compress() { let bytes_vec = [0_u8; 100].to_vec(); - let compression_codecs = [ - CompressionCodec::zstd_default(), - CompressionCodec::gzip_default(), - ]; + let compression_codecs = [CompressionCodec::Zstd(None), CompressionCodec::Gzip(None)]; for codec in compression_codecs { let compressed = codec.compress(bytes_vec.clone()).unwrap(); @@ -244,16 +229,16 @@ mod tests { #[test] fn test_suffix() { assert_eq!(CompressionCodec::None.suffix().unwrap(), ""); - assert_eq!(CompressionCodec::gzip_default().suffix().unwrap(), ".gz"); + assert_eq!(CompressionCodec::Gzip(None).suffix().unwrap(), ".gz"); assert!(CompressionCodec::Lz4.suffix().is_err()); - assert!(CompressionCodec::zstd_default().suffix().is_err()); + assert!(CompressionCodec::Zstd(None).suffix().is_err()); assert!(CompressionCodec::Snappy.suffix().is_err()); let lz4_err = CompressionCodec::Lz4.suffix().unwrap_err(); assert!(lz4_err.to_string().contains("suffix not defined for Lz4")); - let zstd_err = CompressionCodec::zstd_default().suffix().unwrap_err(); + let zstd_err = CompressionCodec::Zstd(None).suffix().unwrap_err(); assert!(zstd_err.to_string().contains("suffix not defined for Zstd")); } @@ -261,16 +246,10 @@ mod tests { fn test_display() { assert_eq!(CompressionCodec::None.to_string(), "None"); assert_eq!(CompressionCodec::Lz4.to_string(), "Lz4"); - assert_eq!( - CompressionCodec::zstd_default().to_string(), - "Zstd(level=3)" - ); - assert_eq!(CompressionCodec::Zstd(5).to_string(), "Zstd(level=5)"); - assert_eq!( - CompressionCodec::gzip_default().to_string(), - "Gzip(level=6)" - ); - assert_eq!(CompressionCodec::Gzip(9).to_string(), "Gzip(level=9)"); + assert_eq!(CompressionCodec::Zstd(None).to_string(), "Zstd"); + assert_eq!(CompressionCodec::Zstd(Some(5)).to_string(), "Zstd(level=5)"); + assert_eq!(CompressionCodec::Gzip(None).to_string(), "Gzip"); + assert_eq!(CompressionCodec::Gzip(Some(9)).to_string(), "Gzip(level=9)"); assert_eq!(CompressionCodec::Snappy.to_string(), "Snappy"); } } diff --git a/crates/iceberg/src/io/object_cache.rs b/crates/iceberg/src/io/object_cache.rs index 017dec048b..6164931201 100644 --- a/crates/iceberg/src/io/object_cache.rs +++ b/crates/iceberg/src/io/object_cache.rs @@ -207,7 +207,6 @@ impl ObjectCache { mod tests { use std::fs; - use apache_avro::Codec; use minijinja::value::Value; use minijinja::{AutoEscape, Environment, context}; use tempfile::TempDir; @@ -215,6 +214,7 @@ mod tests { use super::*; use crate::TableIdent; + use crate::compression::CompressionCodec; use crate::io::{FileIO, OutputFile}; use crate::spec::{ DataContentType, DataFileBuilder, DataFileFormat, Literal, ManifestEntry, @@ -297,7 +297,7 @@ mod tests { None, current_schema.clone(), current_partition_spec.as_ref().clone(), - Codec::Null, + CompressionCodec::None, ) .build_v2_data(); writer @@ -335,7 +335,7 @@ mod tests { current_snapshot.snapshot_id(), current_snapshot.parent_snapshot_id(), current_snapshot.sequence_number(), - Codec::Null, + CompressionCodec::None, ); manifest_list_write .add_manifests(vec![data_file_manifest].into_iter()) diff --git a/crates/iceberg/src/puffin/metadata.rs b/crates/iceberg/src/puffin/metadata.rs index 10cfd265da..4c09a07876 100644 --- a/crates/iceberg/src/puffin/metadata.rs +++ b/crates/iceberg/src/puffin/metadata.rs @@ -1024,7 +1024,7 @@ mod tests { assert_eq!(metadata.blobs.len(), 1); assert_eq!( metadata.blobs[0].compression_codec, - CompressionCodec::gzip_default() + CompressionCodec::Gzip(None) ); } } diff --git a/crates/iceberg/src/puffin/mod.rs b/crates/iceberg/src/puffin/mod.rs index 0e054cac51..12f2aef2a1 100644 --- a/crates/iceberg/src/puffin/mod.rs +++ b/crates/iceberg/src/puffin/mod.rs @@ -34,11 +34,7 @@ fn validate_puffin_compression(codec: CompressionCodec) -> Result<()> { other => Err(Error::new( ErrorKind::DataInvalid, format!( - "Compression codec {} is not supported for Puffin files. Only {}, {}, and {} are supported.", - other.name(), - CompressionCodec::None.name(), - CompressionCodec::Lz4.name(), - CompressionCodec::zstd_default().name() + "{other:?} is not supported for Puffin files. Only None, Lz4, and Zstd are supported." ), )), } @@ -65,10 +61,11 @@ mod tests { // Supported codecs assert!(validate_puffin_compression(CompressionCodec::None).is_ok()); assert!(validate_puffin_compression(CompressionCodec::Lz4).is_ok()); - assert!(validate_puffin_compression(CompressionCodec::zstd_default()).is_ok()); - assert!(validate_puffin_compression(CompressionCodec::Zstd(5)).is_ok()); + assert!(validate_puffin_compression(CompressionCodec::Zstd(None)).is_ok()); + assert!(validate_puffin_compression(CompressionCodec::Zstd(Some(3))).is_ok()); // Unsupported codecs - assert!(validate_puffin_compression(CompressionCodec::gzip_default()).is_err()); + assert!(validate_puffin_compression(CompressionCodec::Gzip(None)).is_err()); + assert!(validate_puffin_compression(CompressionCodec::Snappy).is_err()); } } diff --git a/crates/iceberg/src/puffin/reader.rs b/crates/iceberg/src/puffin/reader.rs index 0aced4186f..c3d90e71f6 100644 --- a/crates/iceberg/src/puffin/reader.rs +++ b/crates/iceberg/src/puffin/reader.rs @@ -144,7 +144,7 @@ mod tests { sequence_number: 1, offset: 4, length: 10, - compression_codec: CompressionCodec::gzip_default(), + compression_codec: CompressionCodec::Gzip(None), properties: HashMap::new(), }; diff --git a/crates/iceberg/src/puffin/test_utils.rs b/crates/iceberg/src/puffin/test_utils.rs index e0844e2002..21e132b2f7 100644 --- a/crates/iceberg/src/puffin/test_utils.rs +++ b/crates/iceberg/src/puffin/test_utils.rs @@ -77,7 +77,7 @@ pub(crate) fn zstd_compressed_metric_blob_0_metadata() -> BlobMetadata { sequence_number: METRIC_BLOB_0_SEQUENCE_NUMBER, offset: 4, length: 22, - compression_codec: CompressionCodec::zstd_default(), + compression_codec: CompressionCodec::Zstd(None), properties: HashMap::new(), } } @@ -134,7 +134,7 @@ pub(crate) fn zstd_compressed_metric_blob_1_metadata() -> BlobMetadata { sequence_number: METRIC_BLOB_1_SEQUENCE_NUMBER, offset: 26, length: 77, - compression_codec: CompressionCodec::zstd_default(), + compression_codec: CompressionCodec::Zstd(None), properties: HashMap::new(), } } diff --git a/crates/iceberg/src/puffin/writer.rs b/crates/iceberg/src/puffin/writer.rs index 4af4970b04..718ee2740c 100644 --- a/crates/iceberg/src/puffin/writer.rs +++ b/crates/iceberg/src/puffin/writer.rs @@ -252,7 +252,7 @@ mod tests { let temp_dir = TempDir::new().unwrap(); let blobs = vec![blob_0(), blob_1()]; let blobs_with_compression = - blobs_with_compression(blobs.clone(), CompressionCodec::zstd_default()); + blobs_with_compression(blobs.clone(), CompressionCodec::Zstd(None)); let input_file = write_puffin_file(&temp_dir, blobs_with_compression, file_properties()) .await @@ -324,8 +324,7 @@ mod tests { async fn test_zstd_compressed_metric_data_is_bit_identical_to_java_generated_file() { let temp_dir = TempDir::new().unwrap(); let blobs = vec![blob_0(), blob_1()]; - let blobs_with_compression = - blobs_with_compression(blobs, CompressionCodec::zstd_default()); + let blobs_with_compression = blobs_with_compression(blobs, CompressionCodec::Zstd(None)); assert_files_are_bit_identical( write_puffin_file(&temp_dir, blobs_with_compression, file_properties()) @@ -340,8 +339,7 @@ mod tests { async fn test_gzip_compression_rejected() { let temp_dir = TempDir::new().unwrap(); let blobs = vec![blob_0()]; - let blobs_with_compression = - blobs_with_compression(blobs, CompressionCodec::gzip_default()); + let blobs_with_compression = blobs_with_compression(blobs, CompressionCodec::Gzip(None)); let result = write_puffin_file(&temp_dir, blobs_with_compression, file_properties()).await; diff --git a/crates/iceberg/src/scan/mod.rs b/crates/iceberg/src/scan/mod.rs index c036131814..3e427db615 100644 --- a/crates/iceberg/src/scan/mod.rs +++ b/crates/iceberg/src/scan/mod.rs @@ -624,7 +624,6 @@ pub mod tests { use std::fs::File; use std::sync::Arc; - use apache_avro::Codec; use arrow_array::cast::AsArray; use arrow_array::{ Array, ArrayRef, BooleanArray, Float64Array, Int32Array, Int64Array, RecordBatch, @@ -640,6 +639,7 @@ pub mod tests { use uuid::Uuid; use crate::arrow::ArrowReaderBuilder; + use crate::compression::CompressionCodec; use crate::expr::{BoundPredicate, Reference}; use crate::io::{FileIO, OutputFile}; use crate::metadata_columns::RESERVED_COL_NAME_FILE; @@ -850,7 +850,7 @@ pub mod tests { None, current_schema.clone(), current_partition_spec.as_ref().clone(), - Codec::Null, + CompressionCodec::None, ) .build_v2_data(); writer @@ -933,7 +933,7 @@ pub mod tests { current_snapshot.snapshot_id(), current_snapshot.parent_snapshot_id(), current_snapshot.sequence_number(), - Codec::Null, + CompressionCodec::None, ); manifest_list_write .add_manifests(vec![data_file_manifest].into_iter()) @@ -1081,7 +1081,7 @@ pub mod tests { None, current_schema.clone(), current_partition_spec.as_ref().clone(), - Codec::Null, + CompressionCodec::None, ) .build_v2_data(); @@ -1171,7 +1171,7 @@ pub mod tests { current_snapshot.snapshot_id(), current_snapshot.parent_snapshot_id(), current_snapshot.sequence_number(), - Codec::Null, + CompressionCodec::None, ); manifest_list_write .add_manifests(vec![data_file_manifest].into_iter()) @@ -1194,7 +1194,7 @@ pub mod tests { None, current_schema.clone(), current_partition_spec.as_ref().clone(), - Codec::Null, + CompressionCodec::None, ) .build_v2_data(); @@ -1230,7 +1230,7 @@ pub mod tests { None, current_schema.clone(), current_partition_spec.as_ref().clone(), - Codec::Null, + CompressionCodec::None, ) .build_v2_deletes(); @@ -1270,7 +1270,7 @@ pub mod tests { current_snapshot.snapshot_id(), current_snapshot.parent_snapshot_id(), current_snapshot.sequence_number(), - Codec::Null, + CompressionCodec::None, ); manifest_list_write .add_manifests(vec![data_manifest, delete_manifest].into_iter()) diff --git a/crates/iceberg/src/spec/avro_util.rs b/crates/iceberg/src/spec/avro_util.rs index 86166129f2..d25c2042ff 100644 --- a/crates/iceberg/src/spec/avro_util.rs +++ b/crates/iceberg/src/spec/avro_util.rs @@ -20,6 +20,9 @@ use apache_avro::{Codec, DeflateSettings, ZstandardSettings}; use log::warn; use miniz_oxide::deflate::CompressionLevel; +use serde_json::Value; + +use crate::compression::CompressionCodec; /// Codec name for gzip compression pub const CODEC_GZIP: &str = "gzip"; @@ -37,36 +40,39 @@ const DEFAULT_ZSTD_LEVEL: u8 = 1; /// Max supported level for ZSTD const MAX_ZSTD_LEVEL: u8 = 22; -/// Convert codec name and level to apache_avro::Codec. -/// Returns Codec::Null for unknown or unsupported codecs. +/// Parse codec name and optional level into a [`CompressionCodec`]. +/// Returns `CompressionCodec::None` for unknown or unsupported codecs. /// /// # Arguments /// /// * `codec` - The name of the compression codec (e.g., "gzip", "zstd", "snappy", "uncompressed") -/// * `level` - The compression level. For gzip/deflate: -/// - 0: NoCompression -/// - 1: BestSpeed -/// - 9: BestCompression -/// - 10: UberCompression -/// - 6: DefaultLevel (balanced speed/compression) -/// - Other values: DefaultLevel -/// -/// For zstd, level is clamped to valid range (0-22). -/// When `None`, uses codec-specific defaults. -/// -/// # Supported Codecs -/// -/// - `gzip`: Uses Deflate compression with specified level -/// - `zstd`: Uses Zstandard compression (level clamped to valid zstd range 0-22) -/// - `snappy`: Uses Snappy compression (level parameter ignored) -/// - `uncompressed` or `None`: No compression -/// - Any other value: Defaults to no compression (Codec::Null) -pub(crate) fn codec_from_str(codec: Option<&str>, level: Option) -> Codec { - // Use case-insensitive comparison to match Java implementation - match codec.map(|s| s.to_lowercase()).as_deref() { - Some(c) if c == CODEC_GZIP => { - // Map compression level to miniz_oxide::deflate::CompressionLevel - // Reference: https://docs.rs/miniz_oxide/latest/miniz_oxide/deflate/enum.CompressionLevel.html +/// * `level` - Optional compression level stored as-is; codec-specific defaults are applied +/// in [`to_avro_codec`] at the point of use. +pub(crate) fn parse_avro_codec(codec: Option<&str>, level: Option) -> CompressionCodec { + let Some(codec_str) = codec else { + return CompressionCodec::None; + }; + let lowercase = codec_str.to_lowercase(); + // "uncompressed" is Avro-specific and not in the standard CompressionCodec names + if lowercase == CODEC_UNCOMPRESSED { + return CompressionCodec::None; + } + let base: CompressionCodec = + serde_json::from_value(Value::String(lowercase)).unwrap_or_else(|_| { + warn!("Unrecognized compression codec '{codec_str}', using no compression"); + CompressionCodec::None + }); + base.with_level(level) +} + +/// Convert a [`CompressionCodec`] to an [`apache_avro::Codec`] for use in Avro writers. +/// Codec-specific defaults are applied here (e.g. gzip defaults to level 9, zstd to level 1). +pub(crate) fn to_avro_codec(codec: CompressionCodec) -> Codec { + match codec { + CompressionCodec::None => Codec::Null, + CompressionCodec::Snappy => Codec::Snappy, + CompressionCodec::Lz4 => Codec::Null, + CompressionCodec::Gzip(level) => { let compression_level = match level.unwrap_or(DEFAULT_GZIP_LEVEL) { 0 => CompressionLevel::NoCompression, 1 => CompressionLevel::BestSpeed, @@ -74,21 +80,12 @@ pub(crate) fn codec_from_str(codec: Option<&str>, level: Option) -> Codec { 10 => CompressionLevel::UberCompression, _ => CompressionLevel::DefaultLevel, }; - Codec::Deflate(DeflateSettings::new(compression_level)) } - Some(c) if c == CODEC_ZSTD => { - // Zstandard supports levels 0-22, clamp to valid range + CompressionCodec::Zstd(level) => { let zstd_level = level.unwrap_or(DEFAULT_ZSTD_LEVEL).min(MAX_ZSTD_LEVEL); Codec::Zstandard(ZstandardSettings::new(zstd_level)) } - Some(c) if c == CODEC_SNAPPY => Codec::Snappy, - Some(c) if c == CODEC_UNCOMPRESSED => Codec::Null, - None => Codec::Null, - Some(unknown) => { - warn!("Unrecognized compression codec '{unknown}', using no compression (Codec::Null)"); - Codec::Null - } } } @@ -100,68 +97,106 @@ mod tests { use super::*; #[test] - fn test_codec_from_str_gzip() { + fn test_parse_avro_codec_gzip() { // Test with mixed case to verify case-insensitive matching - let codec = codec_from_str(Some("GZip"), Some(5)); - assert_eq!( - codec, - Codec::Deflate(DeflateSettings::new(CompressionLevel::DefaultLevel)) - ); + let codec = parse_avro_codec(Some("GZip"), Some(5)); + assert_eq!(codec, CompressionCodec::Gzip(Some(5))); + } + + #[test] + fn test_parse_avro_codec_snappy() { + let codec = parse_avro_codec(Some("snappy"), None); + assert_eq!(codec, CompressionCodec::Snappy); } #[test] - fn test_codec_from_str_snappy() { - let codec = codec_from_str(Some("snappy"), None); - assert_eq!(codec, Codec::Snappy); + fn test_parse_avro_codec_zstd() { + let codec = parse_avro_codec(Some("zstd"), Some(3)); + assert_eq!(codec, CompressionCodec::Zstd(Some(3))); } #[test] - fn test_codec_from_str_zstd() { - let codec = codec_from_str(Some("zstd"), Some(3)); - assert_eq!(codec, Codec::Zstandard(ZstandardSettings::new(3))); + fn test_parse_avro_codec_uncompressed() { + let codec = parse_avro_codec(Some("uncompressed"), None); + assert_eq!(codec, CompressionCodec::None); } #[test] - fn test_codec_from_str_zstd_clamping() { - let codec = codec_from_str(Some("zstd"), Some(MAX_ZSTD_LEVEL + 1)); + fn test_parse_avro_codec_null() { + let codec = parse_avro_codec(None, None); + assert_eq!(codec, CompressionCodec::None); + } + + #[test] + fn test_parse_avro_codec_unknown() { + let codec = parse_avro_codec(Some("unknown"), Some(1)); + assert_eq!(codec, CompressionCodec::None); + } + + #[test] + fn test_parse_avro_codec_gzip_no_level() { + // Level is stored as-is (None), default applied in to_avro_codec + let codec = parse_avro_codec(Some("gzip"), None); + assert_eq!(codec, CompressionCodec::Gzip(None)); + } + + #[test] + fn test_parse_avro_codec_zstd_no_level() { + let codec = parse_avro_codec(Some("zstd"), None); + assert_eq!(codec, CompressionCodec::Zstd(None)); + } + + #[test] + fn test_to_avro_codec_gzip_default() { + // None level → default 9 (BestCompression) + let avro_codec = to_avro_codec(CompressionCodec::Gzip(None)); assert_eq!( - codec, - Codec::Zstandard(ZstandardSettings::new(MAX_ZSTD_LEVEL)) + avro_codec, + Codec::Deflate(DeflateSettings::new(CompressionLevel::BestCompression)) ); } #[test] - fn test_codec_from_str_uncompressed() { - let codec = codec_from_str(Some("uncompressed"), None); - assert!(matches!(codec, Codec::Null)); + fn test_to_avro_codec_gzip_level5() { + let avro_codec = to_avro_codec(CompressionCodec::Gzip(Some(5))); + assert_eq!( + avro_codec, + Codec::Deflate(DeflateSettings::new(CompressionLevel::DefaultLevel)) + ); } #[test] - fn test_codec_from_str_null() { - let codec = codec_from_str(None, None); - assert!(matches!(codec, Codec::Null)); + fn test_to_avro_codec_zstd_default() { + // None level → default 1 + let avro_codec = to_avro_codec(CompressionCodec::Zstd(None)); + assert_eq!(avro_codec, Codec::Zstandard(ZstandardSettings::new(1))); } #[test] - fn test_codec_from_str_unknown() { - let codec = codec_from_str(Some("unknown"), Some(1)); - assert!(matches!(codec, Codec::Null)); + fn test_to_avro_codec_zstd_level3() { + let avro_codec = to_avro_codec(CompressionCodec::Zstd(Some(3))); + assert_eq!(avro_codec, Codec::Zstandard(ZstandardSettings::new(3))); } #[test] - fn test_codec_from_str_gzip_default_level() { - // Test that None level defaults to 9 for gzip - let codec = codec_from_str(Some("gzip"), None); + fn test_to_avro_codec_zstd_clamping() { + let avro_codec = to_avro_codec(CompressionCodec::Zstd(Some(MAX_ZSTD_LEVEL + 1))); assert_eq!( - codec, - Codec::Deflate(DeflateSettings::new(CompressionLevel::BestCompression)) + avro_codec, + Codec::Zstandard(ZstandardSettings::new(MAX_ZSTD_LEVEL)) ); } #[test] - fn test_codec_from_str_zstd_default_level() { - // Test that None level defaults to 1 for zstd - let codec = codec_from_str(Some("zstd"), None); - assert_eq!(codec, Codec::Zstandard(ZstandardSettings::new(1))); + fn test_to_avro_codec_null() { + assert!(matches!(to_avro_codec(CompressionCodec::None), Codec::Null)); + } + + #[test] + fn test_to_avro_codec_snappy() { + assert!(matches!( + to_avro_codec(CompressionCodec::Snappy), + Codec::Snappy + )); } } diff --git a/crates/iceberg/src/spec/manifest/mod.rs b/crates/iceberg/src/spec/manifest/mod.rs index d360e3962b..098e70f7ea 100644 --- a/crates/iceberg/src/spec/manifest/mod.rs +++ b/crates/iceberg/src/spec/manifest/mod.rs @@ -166,6 +166,7 @@ mod tests { use tempfile::TempDir; use super::*; + use crate::compression::CompressionCodec; use crate::io::FileIO; use crate::spec::{Literal, NestedField, PrimitiveType, Struct, Transform, Type}; @@ -273,7 +274,7 @@ mod tests { None, metadata.schema.clone(), metadata.partition_spec.clone(), - Codec::Null, + CompressionCodec::None, ) .build_v2_data(); for entry in &entries { @@ -576,7 +577,7 @@ mod tests { None, metadata.schema.clone(), metadata.partition_spec.clone(), - Codec::Null, + CompressionCodec::None, ) .build_v2_data(); for entry in &entries { @@ -674,7 +675,7 @@ mod tests { None, metadata.schema.clone(), metadata.partition_spec.clone(), - Codec::Null, + CompressionCodec::None, ) .build_v1(); for entry in &entries { @@ -784,7 +785,7 @@ mod tests { None, metadata.schema.clone(), metadata.partition_spec.clone(), - Codec::Null, + CompressionCodec::None, ) .build_v1(); for entry in &entries { @@ -893,7 +894,7 @@ mod tests { None, metadata.schema.clone(), metadata.partition_spec.clone(), - Codec::Null, + CompressionCodec::None, ) .build_v2_data(); for entry in &entries { @@ -1173,7 +1174,7 @@ mod tests { None, metadata.schema.clone(), metadata.partition_spec.clone(), - Codec::Null, + CompressionCodec::None, ) .build_v2_data(); for entry in &entries { diff --git a/crates/iceberg/src/spec/manifest/writer.rs b/crates/iceberg/src/spec/manifest/writer.rs index ffcfc00bc4..0280db063f 100644 --- a/crates/iceberg/src/spec/manifest/writer.rs +++ b/crates/iceberg/src/spec/manifest/writer.rs @@ -19,7 +19,7 @@ use std::cmp::min; use std::future::Future; use std::pin::Pin; -use apache_avro::{Codec, Writer as AvroWriter, to_value}; +use apache_avro::{Writer as AvroWriter, to_value}; use bytes::Bytes; use itertools::Itertools; use serde_json::to_vec; @@ -28,14 +28,14 @@ use super::{ Datum, FormatVersion, ManifestContentType, PartitionSpec, PrimitiveType, UNASSIGNED_SEQUENCE_NUMBER, }; -use crate::encryption::EncryptedOutputFile; +use crate::compression::CompressionCodec; use crate::error::Result; use crate::io::{FileWrite, OutputFile}; use crate::spec::manifest::_serde::{ManifestEntryV1, ManifestEntryV2}; use crate::spec::manifest::{manifest_schema_v1, manifest_schema_v2}; use crate::spec::{ DataContentType, DataFile, FieldSummary, ManifestEntry, ManifestFile, ManifestMetadata, - ManifestStatus, PrimitiveLiteral, SchemaRef, StructType, + ManifestStatus, PrimitiveLiteral, SchemaRef, StructType, avro_util, }; use crate::{Error, ErrorKind}; @@ -53,7 +53,7 @@ pub struct ManifestWriterBuilder { key_metadata: Option>, schema: SchemaRef, partition_spec: PartitionSpec, - compression: Codec, + compression: CompressionCodec, } impl ManifestWriterBuilder { @@ -64,7 +64,7 @@ impl ManifestWriterBuilder { key_metadata: Option>, schema: SchemaRef, partition_spec: PartitionSpec, - compression: Codec, + compression: CompressionCodec, ) -> Self { let location = output.location().to_owned(); Self { @@ -204,7 +204,7 @@ pub struct ManifestWriter { metadata: ManifestMetadata, - compression: Codec, + compression: CompressionCodec, } impl ManifestWriter { @@ -216,7 +216,7 @@ impl ManifestWriter { key_metadata: Option>, metadata: ManifestMetadata, first_row_id: Option, - compression: Codec, + compression: CompressionCodec, ) -> Self { Self { writer_future, @@ -443,7 +443,11 @@ impl ManifestWriter { FormatVersion::V2 | FormatVersion::V3 => manifest_schema_v2(&partition_type)?, }; - let mut avro_writer = AvroWriter::with_codec(&avro_schema, Vec::new(), self.compression); + let mut avro_writer = AvroWriter::with_codec( + &avro_schema, + Vec::new(), + avro_util::to_avro_codec(self.compression), + ); avro_writer.add_user_metadata( "schema".to_string(), to_vec(table_schema).map_err(|err| { @@ -591,12 +595,11 @@ mod tests { use std::fs; use std::sync::Arc; - use apache_avro::DeflateSettings; - use miniz_oxide::deflate::CompressionLevel; use tempfile::TempDir; use super::*; - use crate::io::{FileIO, FileIOBuilder}; + use crate::compression::CompressionCodec; + use crate::io::FileIO; use crate::spec::{ DataContentType, DataFileBuilder, DataFileFormat, Manifest, ManifestContentType, ManifestEntry, ManifestMetadata, ManifestStatus, NestedField, PartitionSpec, PrimitiveType, @@ -733,7 +736,7 @@ mod tests { None, metadata.schema.clone(), metadata.partition_spec.clone(), - Codec::Null, + CompressionCodec::None, ) .build_v2_data(); writer.add_entry(entries[0].clone()).unwrap(); @@ -868,7 +871,7 @@ mod tests { // Write uncompressed manifest with multiple entries to make compression effective let tmp_dir = TempDir::new().unwrap(); let uncompressed_path = tmp_dir.path().join("uncompressed_manifest.avro"); - let io = FileIOBuilder::new_fs_io().build().unwrap(); + let io = FileIO::new_with_fs(); let output_file = io.new_output(uncompressed_path.to_str().unwrap()).unwrap(); let mut writer = ManifestWriterBuilder::new( output_file, @@ -876,7 +879,7 @@ mod tests { None, metadata.schema.clone(), metadata.partition_spec.clone(), - Codec::Null, + CompressionCodec::None, ) .build_v2_data(); // Add multiple entries with long paths to create compressible data @@ -908,7 +911,7 @@ mod tests { // Write compressed manifest with gzip let compressed_path = tmp_dir.path().join("compressed_manifest.avro"); let output_file = io.new_output(compressed_path.to_str().unwrap()).unwrap(); - let compression = Codec::Deflate(DeflateSettings::new(CompressionLevel::BestCompression)); + let compression = CompressionCodec::Gzip(Some(9)); let mut writer = ManifestWriterBuilder::new( output_file, Some(1), diff --git a/crates/iceberg/src/spec/table_metadata.rs b/crates/iceberg/src/spec/table_metadata.rs index 2a51ef7f0a..786a5d3d35 100644 --- a/crates/iceberg/src/spec/table_metadata.rs +++ b/crates/iceberg/src/spec/table_metadata.rs @@ -460,7 +460,7 @@ impl TableMetadata { && metadata_content[0] == 0x1F && metadata_content[1] == 0x8B { - let decompressed_data = CompressionCodec::gzip_default() + let decompressed_data = CompressionCodec::Gzip(None) .decompress(metadata_content.to_vec()) .map_err(|e| { Error::new( @@ -3633,7 +3633,7 @@ mod tests { let original_metadata: TableMetadata = get_test_table_metadata("TableMetadataV2Valid.json"); let json = serde_json::to_string(&original_metadata).unwrap(); - let compressed = CompressionCodec::gzip_default() + let compressed = CompressionCodec::Gzip(None) .compress(json.into_bytes()) .expect("failed to compress metadata"); std::fs::write(&metadata_location, &compressed).expect("failed to write metadata"); diff --git a/crates/iceberg/src/spec/table_properties.rs b/crates/iceberg/src/spec/table_properties.rs index 8046fbf450..5785e5c58c 100644 --- a/crates/iceberg/src/spec/table_properties.rs +++ b/crates/iceberg/src/spec/table_properties.rs @@ -21,6 +21,7 @@ use std::str::FromStr; use crate::compression::CompressionCodec; use crate::error::{Error, ErrorKind, Result}; +use crate::spec::avro_util; fn parse_property( properties: &HashMap, @@ -76,9 +77,7 @@ pub(crate) fn parse_metadata_file_compression( Error::new( ErrorKind::DataInvalid, format!( - "Invalid metadata compression codec: {value}. Only '{}' and '{}' are supported.", - CompressionCodec::None.name(), - CompressionCodec::gzip_default().name() + "Invalid metadata compression codec: {value}. Only 'none' and 'gzip' are supported." ), ) })?; @@ -89,9 +88,7 @@ pub(crate) fn parse_metadata_file_compression( _ => Err(Error::new( ErrorKind::DataInvalid, format!( - "Invalid metadata compression codec: {value}. Only '{}' and '{}' are supported for metadata files.", - CompressionCodec::None.name(), - CompressionCodec::gzip_default().name() + "Invalid metadata compression codec: {value}. Only 'none' and 'gzip' are supported for metadata files." ), )), } @@ -113,12 +110,10 @@ pub struct TableProperties { pub write_format_default: String, /// The target file size for files. pub write_target_file_size_bytes: usize, + /// Compression codec for Avro files (manifests, manifest lists) + pub avro_compression_codec: CompressionCodec, /// Compression codec for metadata files (JSON) pub metadata_compression_codec: CompressionCodec, - /// Compression codec for Avro files (manifests, manifest lists) - pub avro_compression_codec: String, - /// Compression level for Avro files (None uses codec-specific defaults) - pub avro_compression_level: Option, /// Whether to use `FanoutWriter` for partitioned tables. pub write_datafusion_fanout_enabled: bool, /// Whether garbage collection is enabled on drop. @@ -340,26 +335,24 @@ impl TryFrom<&HashMap> for TableProperties { TableProperties::PROPERTY_WRITE_TARGET_FILE_SIZE_BYTES, TableProperties::PROPERTY_WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT, )?, + avro_compression_codec: { + let codec_name = parse_property( + props, + TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC, + TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC_DEFAULT.to_string(), + )?; + + // Sentinel value 255 means level was not specified + let level_raw = parse_property( + props, + TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL, + 255u8, + )?; + let level = if level_raw == 255 { None } else { Some(level_raw) }; + + avro_util::parse_avro_codec(Some(&codec_name), level) + }, metadata_compression_codec: parse_metadata_file_compression(props)?, - avro_compression_codec: parse_property( - props, - TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC, - TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC_DEFAULT.to_string(), - )?, - avro_compression_level: props - .get(TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL) - .map(|v| { - v.parse::().map_err(|e| { - Error::new( - ErrorKind::DataInvalid, - format!( - "Invalid value for {}: {e}", - TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL - ), - ) - }) - }) - .transpose()?, write_datafusion_fanout_enabled: parse_property( props, TableProperties::PROPERTY_DATAFUSION_WRITE_FANOUT_ENABLED, @@ -419,9 +412,6 @@ impl TryFrom<&HashMap> for TableProperties { #[cfg(test)] mod tests { - use apache_avro::{DeflateSettings, ZstandardSettings}; - use miniz_oxide::deflate::CompressionLevel; - use super::*; use crate::compression::CompressionCodec; @@ -449,18 +439,15 @@ mod tests { table_properties.write_target_file_size_bytes, TableProperties::PROPERTY_WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT ); - // Test compression defaults (none means CompressionCodec::None) - assert_eq!( - table_properties.metadata_compression_codec, - CompressionCodec::None - ); + // Test compression defaults - should be gzip with no level specified assert_eq!( table_properties.avro_compression_codec, - TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC_DEFAULT.to_string() + CompressionCodec::Gzip(None) ); + // Test compression defaults (none means CompressionCodec::None) assert_eq!( - table_properties.avro_compression_level, - TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL_DEFAULT + table_properties.metadata_compression_codec, + CompressionCodec::None ); assert_eq!( table_properties.gc_enabled, @@ -515,12 +502,24 @@ mod tests { ), ]); let table_properties = TableProperties::try_from(&props).unwrap(); + // Check that it parsed to a Zstd codec with level 3 + assert_eq!( + table_properties.avro_compression_codec, + CompressionCodec::Zstd(Some(3)) + ); + } + + #[test] + fn test_table_properties_compression() { + let props = HashMap::from([( + TableProperties::PROPERTY_METADATA_COMPRESSION_CODEC.to_string(), + "gzip".to_string(), + )]); + let table_properties = TableProperties::try_from(&props).unwrap(); assert_eq!( table_properties.metadata_compression_codec, - CompressionCodec::gzip_default() + CompressionCodec::Gzip(None) ); - assert_eq!(table_properties.avro_compression_codec, "zstd"); - assert_eq!(table_properties.avro_compression_level, Some(3)); } #[test] @@ -546,7 +545,7 @@ mod tests { let table_properties = TableProperties::try_from(&props_upper).unwrap(); assert_eq!( table_properties.metadata_compression_codec, - CompressionCodec::gzip_default() + CompressionCodec::Gzip(None) ); // Test mixed case @@ -557,7 +556,7 @@ mod tests { let table_properties = TableProperties::try_from(&props_mixed).unwrap(); assert_eq!( table_properties.metadata_compression_codec, - CompressionCodec::gzip_default() + CompressionCodec::Gzip(None) ); // Test "NONE" should also be case-insensitive @@ -712,7 +711,7 @@ mod tests { )]); assert_eq!( parse_metadata_file_compression(&props).unwrap(), - CompressionCodec::gzip_default() + CompressionCodec::Gzip(None) ); // Test case insensitivity - "NONE" @@ -732,7 +731,7 @@ mod tests { )]); assert_eq!( parse_metadata_file_compression(&props).unwrap(), - CompressionCodec::gzip_default() + CompressionCodec::Gzip(None) ); // Test case insensitivity - "GzIp" @@ -742,7 +741,7 @@ mod tests { )]); assert_eq!( parse_metadata_file_compression(&props).unwrap(), - CompressionCodec::gzip_default() + CompressionCodec::Gzip(None) ); // Test default when property is missing diff --git a/crates/iceberg/src/transaction/snapshot.rs b/crates/iceberg/src/transaction/snapshot.rs index f139dd5dd5..2f52617a97 100644 --- a/crates/iceberg/src/transaction/snapshot.rs +++ b/crates/iceberg/src/transaction/snapshot.rs @@ -24,7 +24,7 @@ use futures::stream::FuturesUnordered; use uuid::Uuid; use crate::error::Result; -use crate::spec::avro_util::codec_from_str; +use crate::spec::avro_util::to_avro_codec; use crate::spec::{ DataFile, DataFileFormat, FormatVersion, MAIN_BRANCH, ManifestContentType, ManifestEntry, ManifestFile, ManifestListWriter, ManifestWriter, ManifestWriterBuilder, Operation, Snapshot, @@ -274,7 +274,7 @@ impl<'a> SnapshotProducer<'a> { .default_partition_spec() .as_ref() .clone(), - codec_from_str(Some(&table_props.avro_compression_codec), table_props.avro_compression_level), + table_props.avro_compression_codec, ); match self.table.metadata().format_version() { @@ -467,7 +467,7 @@ impl<'a> SnapshotProducer<'a> { .with_source(e) })?; - let compression = codec_from_str(Some(&table_props.avro_compression_codec), table_props.avro_compression_level); + let compression = to_avro_codec(table_props.avro_compression_codec); let mut manifest_list_writer = match self.table.metadata().format_version() { FormatVersion::V1 => ManifestListWriter::v1( From 8297287f396204d7058180e965d6e21f383613c5 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Fri, 26 Jun 2026 18:27:04 +0000 Subject: [PATCH 25/33] fixes --- crates/iceberg/src/io/object_cache.rs | 17 +++---- crates/iceberg/src/puffin/reader.rs | 2 +- crates/iceberg/src/puffin/writer.rs | 2 +- crates/iceberg/src/scan/mod.rs | 47 +++++++------------- crates/iceberg/src/spec/avro_util.rs | 40 +++++++++++------ crates/iceberg/src/spec/manifest/writer.rs | 4 +- crates/iceberg/src/spec/manifest_list/mod.rs | 1 - crates/iceberg/src/spec/table_metadata.rs | 1 + crates/iceberg/src/spec/table_properties.rs | 2 +- crates/iceberg/src/transaction/append.rs | 9 ++-- crates/iceberg/src/transaction/snapshot.rs | 2 +- 11 files changed, 64 insertions(+), 63 deletions(-) diff --git a/crates/iceberg/src/io/object_cache.rs b/crates/iceberg/src/io/object_cache.rs index 6164931201..d5b3bb16ad 100644 --- a/crates/iceberg/src/io/object_cache.rs +++ b/crates/iceberg/src/io/object_cache.rs @@ -212,6 +212,8 @@ mod tests { use tempfile::TempDir; use uuid::Uuid; + use apache_avro::Codec; + use super::*; use crate::TableIdent; use crate::compression::CompressionCodec; @@ -322,20 +324,15 @@ mod tests { let data_file_manifest = writer.write_manifest_file().await.unwrap(); // Write to manifest list - let manifest_list_writer = self - .table - .file_io() - .new_output(current_snapshot.manifest_list()) - .unwrap() - .writer() - .await - .unwrap(); let mut manifest_list_write = ManifestListWriter::v2( - manifest_list_writer, + self.table + .file_io() + .new_output(current_snapshot.manifest_list()) + .unwrap(), current_snapshot.snapshot_id(), current_snapshot.parent_snapshot_id(), current_snapshot.sequence_number(), - CompressionCodec::None, + Codec::Null, ); manifest_list_write .add_manifests(vec![data_file_manifest].into_iter()) diff --git a/crates/iceberg/src/puffin/reader.rs b/crates/iceberg/src/puffin/reader.rs index c3d90e71f6..3e8e3294bf 100644 --- a/crates/iceberg/src/puffin/reader.rs +++ b/crates/iceberg/src/puffin/reader.rs @@ -153,7 +153,7 @@ mod tests { assert!(result.is_err()); let err = result.unwrap_err(); assert_eq!(err.kind(), ErrorKind::DataInvalid); - assert!(err.to_string().contains("gzip")); + assert!(err.to_string().to_lowercase().contains("gzip")); assert!( err.to_string() .contains("is not supported for Puffin files") diff --git a/crates/iceberg/src/puffin/writer.rs b/crates/iceberg/src/puffin/writer.rs index 718ee2740c..70584b435c 100644 --- a/crates/iceberg/src/puffin/writer.rs +++ b/crates/iceberg/src/puffin/writer.rs @@ -346,7 +346,7 @@ mod tests { assert!(result.is_err()); let err = result.unwrap_err(); assert_eq!(err.kind(), ErrorKind::DataInvalid); - assert!(err.to_string().contains("gzip")); + assert!(err.to_string().to_lowercase().contains("gzip")); assert!( err.to_string() .contains("is not supported for Puffin files") diff --git a/crates/iceberg/src/scan/mod.rs b/crates/iceberg/src/scan/mod.rs index 3e427db615..1c74ce7e76 100644 --- a/crates/iceberg/src/scan/mod.rs +++ b/crates/iceberg/src/scan/mod.rs @@ -638,6 +638,8 @@ pub mod tests { use tempfile::TempDir; use uuid::Uuid; + use apache_avro::Codec; + use crate::arrow::ArrowReaderBuilder; use crate::compression::CompressionCodec; use crate::expr::{BoundPredicate, Reference}; @@ -920,20 +922,15 @@ pub mod tests { let data_file_manifest = writer.write_manifest_file().await.unwrap(); // Write to manifest list - let manifest_list_writer = self - .table - .file_io() - .new_output(current_snapshot.manifest_list()) - .unwrap() - .writer() - .await - .unwrap(); let mut manifest_list_write = ManifestListWriter::v2( - manifest_list_writer, + self.table + .file_io() + .new_output(current_snapshot.manifest_list()) + .unwrap(), current_snapshot.snapshot_id(), current_snapshot.parent_snapshot_id(), current_snapshot.sequence_number(), - CompressionCodec::None, + Codec::Null, ); manifest_list_write .add_manifests(vec![data_file_manifest].into_iter()) @@ -1158,20 +1155,15 @@ pub mod tests { let data_file_manifest = writer.write_manifest_file().await.unwrap(); // Write to manifest list - let manifest_list_writer = self - .table - .file_io() - .new_output(current_snapshot.manifest_list()) - .unwrap() - .writer() - .await - .unwrap(); let mut manifest_list_write = ManifestListWriter::v2( - manifest_list_writer, + self.table + .file_io() + .new_output(current_snapshot.manifest_list()) + .unwrap(), current_snapshot.snapshot_id(), current_snapshot.parent_snapshot_id(), current_snapshot.sequence_number(), - CompressionCodec::None, + Codec::Null, ); manifest_list_write .add_manifests(vec![data_file_manifest].into_iter()) @@ -1257,20 +1249,15 @@ pub mod tests { // Write to manifest list - DATA FIRST then DELETE // This order is crucial for reproduction - let manifest_list_writer = self - .table - .file_io() - .new_output(current_snapshot.manifest_list()) - .unwrap() - .writer() - .await - .unwrap(); let mut manifest_list_write = ManifestListWriter::v2( - manifest_list_writer, + self.table + .file_io() + .new_output(current_snapshot.manifest_list()) + .unwrap(), current_snapshot.snapshot_id(), current_snapshot.parent_snapshot_id(), current_snapshot.sequence_number(), - CompressionCodec::None, + Codec::Null, ); manifest_list_write .add_manifests(vec![data_manifest, delete_manifest].into_iter()) diff --git a/crates/iceberg/src/spec/avro_util.rs b/crates/iceberg/src/spec/avro_util.rs index d25c2042ff..aebb261472 100644 --- a/crates/iceberg/src/spec/avro_util.rs +++ b/crates/iceberg/src/spec/avro_util.rs @@ -23,6 +23,8 @@ use miniz_oxide::deflate::CompressionLevel; use serde_json::Value; use crate::compression::CompressionCodec; +use crate::error::Result; +use crate::{Error, ErrorKind}; /// Codec name for gzip compression pub const CODEC_GZIP: &str = "gzip"; @@ -67,11 +69,15 @@ pub(crate) fn parse_avro_codec(codec: Option<&str>, level: Option) -> Compre /// Convert a [`CompressionCodec`] to an [`apache_avro::Codec`] for use in Avro writers. /// Codec-specific defaults are applied here (e.g. gzip defaults to level 9, zstd to level 1). -pub(crate) fn to_avro_codec(codec: CompressionCodec) -> Codec { +/// Returns an error for codecs that are not supported in Avro (e.g. LZ4). +pub(crate) fn to_avro_codec(codec: CompressionCodec) -> Result { match codec { - CompressionCodec::None => Codec::Null, - CompressionCodec::Snappy => Codec::Snappy, - CompressionCodec::Lz4 => Codec::Null, + CompressionCodec::None => Ok(Codec::Null), + CompressionCodec::Snappy => Ok(Codec::Snappy), + CompressionCodec::Lz4 => Err(Error::new( + ErrorKind::FeatureUnsupported, + "LZ4 compression is not supported for Avro files", + )), CompressionCodec::Gzip(level) => { let compression_level = match level.unwrap_or(DEFAULT_GZIP_LEVEL) { 0 => CompressionLevel::NoCompression, @@ -80,11 +86,11 @@ pub(crate) fn to_avro_codec(codec: CompressionCodec) -> Codec { 10 => CompressionLevel::UberCompression, _ => CompressionLevel::DefaultLevel, }; - Codec::Deflate(DeflateSettings::new(compression_level)) + Ok(Codec::Deflate(DeflateSettings::new(compression_level))) } CompressionCodec::Zstd(level) => { let zstd_level = level.unwrap_or(DEFAULT_ZSTD_LEVEL).min(MAX_ZSTD_LEVEL); - Codec::Zstandard(ZstandardSettings::new(zstd_level)) + Ok(Codec::Zstandard(ZstandardSettings::new(zstd_level))) } } } @@ -149,7 +155,7 @@ mod tests { #[test] fn test_to_avro_codec_gzip_default() { // None level → default 9 (BestCompression) - let avro_codec = to_avro_codec(CompressionCodec::Gzip(None)); + let avro_codec = to_avro_codec(CompressionCodec::Gzip(None)).unwrap(); assert_eq!( avro_codec, Codec::Deflate(DeflateSettings::new(CompressionLevel::BestCompression)) @@ -158,7 +164,7 @@ mod tests { #[test] fn test_to_avro_codec_gzip_level5() { - let avro_codec = to_avro_codec(CompressionCodec::Gzip(Some(5))); + let avro_codec = to_avro_codec(CompressionCodec::Gzip(Some(5))).unwrap(); assert_eq!( avro_codec, Codec::Deflate(DeflateSettings::new(CompressionLevel::DefaultLevel)) @@ -168,19 +174,19 @@ mod tests { #[test] fn test_to_avro_codec_zstd_default() { // None level → default 1 - let avro_codec = to_avro_codec(CompressionCodec::Zstd(None)); + let avro_codec = to_avro_codec(CompressionCodec::Zstd(None)).unwrap(); assert_eq!(avro_codec, Codec::Zstandard(ZstandardSettings::new(1))); } #[test] fn test_to_avro_codec_zstd_level3() { - let avro_codec = to_avro_codec(CompressionCodec::Zstd(Some(3))); + let avro_codec = to_avro_codec(CompressionCodec::Zstd(Some(3))).unwrap(); assert_eq!(avro_codec, Codec::Zstandard(ZstandardSettings::new(3))); } #[test] fn test_to_avro_codec_zstd_clamping() { - let avro_codec = to_avro_codec(CompressionCodec::Zstd(Some(MAX_ZSTD_LEVEL + 1))); + let avro_codec = to_avro_codec(CompressionCodec::Zstd(Some(MAX_ZSTD_LEVEL + 1))).unwrap(); assert_eq!( avro_codec, Codec::Zstandard(ZstandardSettings::new(MAX_ZSTD_LEVEL)) @@ -189,14 +195,22 @@ mod tests { #[test] fn test_to_avro_codec_null() { - assert!(matches!(to_avro_codec(CompressionCodec::None), Codec::Null)); + assert!(matches!( + to_avro_codec(CompressionCodec::None).unwrap(), + Codec::Null + )); } #[test] fn test_to_avro_codec_snappy() { assert!(matches!( - to_avro_codec(CompressionCodec::Snappy), + to_avro_codec(CompressionCodec::Snappy).unwrap(), Codec::Snappy )); } + + #[test] + fn test_to_avro_codec_lz4_unsupported() { + assert!(to_avro_codec(CompressionCodec::Lz4).is_err()); + } } diff --git a/crates/iceberg/src/spec/manifest/writer.rs b/crates/iceberg/src/spec/manifest/writer.rs index 0280db063f..b1a572f8ad 100644 --- a/crates/iceberg/src/spec/manifest/writer.rs +++ b/crates/iceberg/src/spec/manifest/writer.rs @@ -446,7 +446,7 @@ impl ManifestWriter { let mut avro_writer = AvroWriter::with_codec( &avro_schema, Vec::new(), - avro_util::to_avro_codec(self.compression), + avro_util::to_avro_codec(self.compression)?, ); avro_writer.add_user_metadata( "schema".to_string(), @@ -825,7 +825,7 @@ mod tests { None, schema.clone(), partition_spec.clone(), - Codec::Null, + CompressionCodec::None, ) .build_v3_deletes(); diff --git a/crates/iceberg/src/spec/manifest_list/mod.rs b/crates/iceberg/src/spec/manifest_list/mod.rs index d6d606dc8b..a673e626af 100644 --- a/crates/iceberg/src/spec/manifest_list/mod.rs +++ b/crates/iceberg/src/spec/manifest_list/mod.rs @@ -100,7 +100,6 @@ mod test { use super::_serde::ManifestFileV2; use super::*; use crate::io::FileIO; - use apache_avro::Codec; use crate::spec::{Datum, FieldSummary, ManifestContentType, ManifestFile}; diff --git a/crates/iceberg/src/spec/table_metadata.rs b/crates/iceberg/src/spec/table_metadata.rs index 786a5d3d35..a39ec193ac 100644 --- a/crates/iceberg/src/spec/table_metadata.rs +++ b/crates/iceberg/src/spec/table_metadata.rs @@ -1614,6 +1614,7 @@ mod tests { use crate::catalog::MetadataLocation; use crate::compression::CompressionCodec; use crate::io::FileIO; + use crate::spec::TableProperties; use crate::spec::table_metadata::TableMetadata; use crate::spec::{ BlobMetadata, EncryptedKey, INITIAL_ROW_ID, Literal, NestedField, NullOrder, Operation, diff --git a/crates/iceberg/src/spec/table_properties.rs b/crates/iceberg/src/spec/table_properties.rs index 5785e5c58c..457458bc0c 100644 --- a/crates/iceberg/src/spec/table_properties.rs +++ b/crates/iceberg/src/spec/table_properties.rs @@ -490,7 +490,7 @@ mod tests { } #[test] - fn test_table_properties_compression() { + fn test_table_properties_avro_compression() { let props = HashMap::from([ ( TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC.to_string(), diff --git a/crates/iceberg/src/transaction/append.rs b/crates/iceberg/src/transaction/append.rs index 36fde117ab..b2fb0e8846 100644 --- a/crates/iceberg/src/transaction/append.rs +++ b/crates/iceberg/src/transaction/append.rs @@ -158,6 +158,9 @@ mod tests { use tempfile::TempDir; use uuid::Uuid; + use apache_avro::Codec; + + use crate::compression::CompressionCodec; use crate::io::FileIO; use crate::spec::{ DataContentType, DataFileBuilder, DataFileFormat, Literal, MAIN_BRANCH, ManifestEntry, @@ -236,6 +239,7 @@ mod tests { None, schema.clone(), partition_spec.as_ref().clone(), + CompressionCodec::None, ) .build_v2_data(); data_writer @@ -266,6 +270,7 @@ mod tests { None, schema.clone(), partition_spec.as_ref().clone(), + CompressionCodec::None, ) .build_v2_data(); delete_writer @@ -301,13 +306,11 @@ mod tests { table .file_io() .new_output(current_snapshot.manifest_list()) - .unwrap() - .writer() - .await .unwrap(), current_snapshot.snapshot_id(), current_snapshot.parent_snapshot_id(), current_snapshot.sequence_number(), + Codec::Null, ); manifest_list_write .add_manifests(vec![data_manifest, delete_manifest].into_iter()) diff --git a/crates/iceberg/src/transaction/snapshot.rs b/crates/iceberg/src/transaction/snapshot.rs index 2f52617a97..77f09152a5 100644 --- a/crates/iceberg/src/transaction/snapshot.rs +++ b/crates/iceberg/src/transaction/snapshot.rs @@ -467,7 +467,7 @@ impl<'a> SnapshotProducer<'a> { .with_source(e) })?; - let compression = to_avro_codec(table_props.avro_compression_codec); + let compression = to_avro_codec(table_props.avro_compression_codec)?; let mut manifest_list_writer = match self.table.metadata().format_version() { FormatVersion::V1 => ManifestListWriter::v1( From 469d2302bfaead7b50c4be6cfe92267c8e592def Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Fri, 26 Jun 2026 20:00:13 +0000 Subject: [PATCH 26/33] wip --- crates/iceberg/src/compression.rs | 17 +++++++++ crates/iceberg/src/spec/avro_util.rs | 16 +++++---- crates/iceberg/src/spec/manifest/writer.rs | 3 ++ .../iceberg/src/spec/manifest_list/writer.rs | 3 +- crates/iceberg/src/spec/table_metadata.rs | 1 - crates/iceberg/src/spec/table_properties.rs | 1 - crates/iceberg/src/transaction/snapshot.rs | 36 ++++++++----------- 7 files changed, 46 insertions(+), 31 deletions(-) diff --git a/crates/iceberg/src/compression.rs b/crates/iceberg/src/compression.rs index 9d1bac752f..f8ba17b39d 100644 --- a/crates/iceberg/src/compression.rs +++ b/crates/iceberg/src/compression.rs @@ -242,6 +242,23 @@ mod tests { assert!(zstd_err.to_string().contains("suffix not defined for Zstd")); } + #[test] + fn test_with_level_ignores_level_for_level_less_variants() { + // Snappy and Lz4 don't carry a level; with_level must return them unchanged. + assert_eq!( + CompressionCodec::Snappy.with_level(Some(5)), + CompressionCodec::Snappy + ); + assert_eq!( + CompressionCodec::Lz4.with_level(Some(5)), + CompressionCodec::Lz4 + ); + assert_eq!( + CompressionCodec::None.with_level(Some(5)), + CompressionCodec::None + ); + } + #[test] fn test_display() { assert_eq!(CompressionCodec::None.to_string(), "None"); diff --git a/crates/iceberg/src/spec/avro_util.rs b/crates/iceberg/src/spec/avro_util.rs index aebb261472..01cf4062b4 100644 --- a/crates/iceberg/src/spec/avro_util.rs +++ b/crates/iceberg/src/spec/avro_util.rs @@ -26,12 +26,6 @@ use crate::compression::CompressionCodec; use crate::error::Result; use crate::{Error, ErrorKind}; -/// Codec name for gzip compression -pub const CODEC_GZIP: &str = "gzip"; -/// Codec name for zstd compression -pub const CODEC_ZSTD: &str = "zstd"; -/// Codec name for snappy compression -pub const CODEC_SNAPPY: &str = "snappy"; /// Codec name for uncompressed pub const CODEC_UNCOMPRESSED: &str = "uncompressed"; @@ -43,7 +37,12 @@ const DEFAULT_ZSTD_LEVEL: u8 = 1; const MAX_ZSTD_LEVEL: u8 = 22; /// Parse codec name and optional level into a [`CompressionCodec`]. -/// Returns `CompressionCodec::None` for unknown or unsupported codecs. +/// +/// Unknown codec names fall back to `CompressionCodec::None` with a `warn!` log entry rather +/// than returning an error. This is intentional graceful degradation: the Avro spec allows +/// readers to tolerate unknown codec metadata fields, and a hard error here would break +/// reading tables written by newer implementations. Callers that need strict validation should +/// check the codec before writing. /// /// # Arguments /// @@ -79,6 +78,9 @@ pub(crate) fn to_avro_codec(codec: CompressionCodec) -> Result { "LZ4 compression is not supported for Avro files", )), CompressionCodec::Gzip(level) => { + // miniz_oxide::CompressionLevel only exposes a handful of named levels, so levels + // 2–8 and 11+ all map to DefaultLevel. This is a library granularity limitation, + // not a bug — the output is valid gzip at whatever level miniz chooses. let compression_level = match level.unwrap_or(DEFAULT_GZIP_LEVEL) { 0 => CompressionLevel::NoCompression, 1 => CompressionLevel::BestSpeed, diff --git a/crates/iceberg/src/spec/manifest/writer.rs b/crates/iceberg/src/spec/manifest/writer.rs index b1a572f8ad..ce7f5121bf 100644 --- a/crates/iceberg/src/spec/manifest/writer.rs +++ b/crates/iceberg/src/spec/manifest/writer.rs @@ -43,6 +43,9 @@ use crate::{Error, ErrorKind}; /// with the actual snapshot ID before it is committed. const UNASSIGNED_SNAPSHOT_ID: i64 = -1; +// The future is pinned eagerly (rather than holding OutputFile) because OutputFile is consumed +// by `output.writer().await` — we can't store it and call writer() later. Capturing the future +// at construction time lets us defer the async work while still owning the location string. type WriterFuture = Pin>> + Send>>; /// The builder used to create a [`ManifestWriter`]. diff --git a/crates/iceberg/src/spec/manifest_list/writer.rs b/crates/iceberg/src/spec/manifest_list/writer.rs index 963fdce5c9..ac1e5a8087 100644 --- a/crates/iceberg/src/spec/manifest_list/writer.rs +++ b/crates/iceberg/src/spec/manifest_list/writer.rs @@ -251,7 +251,8 @@ impl ManifestListWriter { Ok(()) } - /// Returns number of newly assigned first-row-ids, if any. + /// Assigns `manifest.first_row_id` if not already set, advancing `self.next_row_id` + /// by the manifest's record count. fn assign_first_row_id(&mut self, manifest: &mut ManifestFile) -> Result<()> { match manifest.content { ManifestContentType::Data => { diff --git a/crates/iceberg/src/spec/table_metadata.rs b/crates/iceberg/src/spec/table_metadata.rs index a39ec193ac..c1f09ac0f5 100644 --- a/crates/iceberg/src/spec/table_metadata.rs +++ b/crates/iceberg/src/spec/table_metadata.rs @@ -3719,7 +3719,6 @@ mod tests { assert_eq!(read_metadata, compressed_metadata); } - #[test] fn test_partition_name_exists() { let schema = Schema::builder() diff --git a/crates/iceberg/src/spec/table_properties.rs b/crates/iceberg/src/spec/table_properties.rs index 457458bc0c..ebf4f55bb0 100644 --- a/crates/iceberg/src/spec/table_properties.rs +++ b/crates/iceberg/src/spec/table_properties.rs @@ -94,7 +94,6 @@ pub(crate) fn parse_metadata_file_compression( } } - /// TableProperties that contains the properties of a table. #[derive(Debug)] pub struct TableProperties { diff --git a/crates/iceberg/src/transaction/snapshot.rs b/crates/iceberg/src/transaction/snapshot.rs index 77f09152a5..be888ac410 100644 --- a/crates/iceberg/src/transaction/snapshot.rs +++ b/crates/iceberg/src/transaction/snapshot.rs @@ -23,6 +23,7 @@ use futures::TryStreamExt; use futures::stream::FuturesUnordered; use uuid::Uuid; +use crate::compression::CompressionCodec; use crate::error::Result; use crate::spec::avro_util::to_avro_codec; use crate::spec::{ @@ -166,6 +167,18 @@ impl<'a> SnapshotProducer<'a> { Ok(()) } + fn avro_compression_codec(&self) -> Result { + TableProperties::try_from(self.table.metadata().properties()) + .map(|p| p.avro_compression_codec) + .map_err(|e| { + Error::new( + ErrorKind::DataInvalid, + "Failed to parse table properties for compression settings", + ) + .with_source(e) + }) + } + pub(crate) async fn validate_duplicate_files(&self) -> Result<()> { let Some(current_snapshot) = self.table.metadata().current_snapshot() else { return Ok(()); @@ -254,16 +267,6 @@ impl<'a> SnapshotProducer<'a> { ); let output_file = self.table.file_io().new_output(new_manifest_path)?; - // Get compression settings from table properties - let table_props = - TableProperties::try_from(self.table.metadata().properties()).map_err(|e| { - Error::new( - ErrorKind::DataInvalid, - "Failed to parse table properties for compression settings", - ) - .with_source(e) - })?; - let builder = ManifestWriterBuilder::new( output_file, Some(self.snapshot_id), @@ -274,7 +277,7 @@ impl<'a> SnapshotProducer<'a> { .default_partition_spec() .as_ref() .clone(), - table_props.avro_compression_codec, + self.avro_compression_codec()?, ); match self.table.metadata().format_version() { @@ -458,16 +461,7 @@ impl<'a> SnapshotProducer<'a> { .table .file_io() .new_output(manifest_list_path.clone())?; - let table_props = - TableProperties::try_from(self.table.metadata().properties()).map_err(|e| { - Error::new( - ErrorKind::DataInvalid, - "Failed to parse table properties for compression settings", - ) - .with_source(e) - })?; - - let compression = to_avro_codec(table_props.avro_compression_codec)?; + let compression = to_avro_codec(self.avro_compression_codec()?)?; let mut manifest_list_writer = match self.table.metadata().format_version() { FormatVersion::V1 => ManifestListWriter::v1( From 09be7a79d40ce6141857a3d908dbcd4a6fe2adf9 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Fri, 26 Jun 2026 20:03:27 +0000 Subject: [PATCH 27/33] error --- crates/iceberg/src/spec/avro_util.rs | 47 +++++++++++---------- crates/iceberg/src/spec/table_properties.rs | 2 +- 2 files changed, 25 insertions(+), 24 deletions(-) diff --git a/crates/iceberg/src/spec/avro_util.rs b/crates/iceberg/src/spec/avro_util.rs index 01cf4062b4..b68bd62200 100644 --- a/crates/iceberg/src/spec/avro_util.rs +++ b/crates/iceberg/src/spec/avro_util.rs @@ -18,7 +18,6 @@ //! Utilities for working with Apache Avro in Iceberg. use apache_avro::{Codec, DeflateSettings, ZstandardSettings}; -use log::warn; use miniz_oxide::deflate::CompressionLevel; use serde_json::Value; @@ -38,32 +37,31 @@ const MAX_ZSTD_LEVEL: u8 = 22; /// Parse codec name and optional level into a [`CompressionCodec`]. /// -/// Unknown codec names fall back to `CompressionCodec::None` with a `warn!` log entry rather -/// than returning an error. This is intentional graceful degradation: the Avro spec allows -/// readers to tolerate unknown codec metadata fields, and a hard error here would break -/// reading tables written by newer implementations. Callers that need strict validation should -/// check the codec before writing. +/// Returns `DataInvalid` for unrecognized codec names, matching the Java implementation which +/// throws `IllegalArgumentException("Unsupported compression codec: ...")` in the same case. /// /// # Arguments /// /// * `codec` - The name of the compression codec (e.g., "gzip", "zstd", "snappy", "uncompressed") /// * `level` - Optional compression level stored as-is; codec-specific defaults are applied /// in [`to_avro_codec`] at the point of use. -pub(crate) fn parse_avro_codec(codec: Option<&str>, level: Option) -> CompressionCodec { +pub(crate) fn parse_avro_codec(codec: Option<&str>, level: Option) -> Result { let Some(codec_str) = codec else { - return CompressionCodec::None; + return Ok(CompressionCodec::None); }; let lowercase = codec_str.to_lowercase(); // "uncompressed" is Avro-specific and not in the standard CompressionCodec names if lowercase == CODEC_UNCOMPRESSED { - return CompressionCodec::None; + return Ok(CompressionCodec::None); } let base: CompressionCodec = - serde_json::from_value(Value::String(lowercase)).unwrap_or_else(|_| { - warn!("Unrecognized compression codec '{codec_str}', using no compression"); - CompressionCodec::None - }); - base.with_level(level) + serde_json::from_value(Value::String(lowercase)).map_err(|_| { + Error::new( + ErrorKind::DataInvalid, + format!("Unsupported compression codec: {codec_str}"), + ) + })?; + Ok(base.with_level(level)) } /// Convert a [`CompressionCodec`] to an [`apache_avro::Codec`] for use in Avro writers. @@ -107,50 +105,53 @@ mod tests { #[test] fn test_parse_avro_codec_gzip() { // Test with mixed case to verify case-insensitive matching - let codec = parse_avro_codec(Some("GZip"), Some(5)); + let codec = parse_avro_codec(Some("GZip"), Some(5)).unwrap(); assert_eq!(codec, CompressionCodec::Gzip(Some(5))); } #[test] fn test_parse_avro_codec_snappy() { - let codec = parse_avro_codec(Some("snappy"), None); + let codec = parse_avro_codec(Some("snappy"), None).unwrap(); assert_eq!(codec, CompressionCodec::Snappy); } #[test] fn test_parse_avro_codec_zstd() { - let codec = parse_avro_codec(Some("zstd"), Some(3)); + let codec = parse_avro_codec(Some("zstd"), Some(3)).unwrap(); assert_eq!(codec, CompressionCodec::Zstd(Some(3))); } #[test] fn test_parse_avro_codec_uncompressed() { - let codec = parse_avro_codec(Some("uncompressed"), None); + let codec = parse_avro_codec(Some("uncompressed"), None).unwrap(); assert_eq!(codec, CompressionCodec::None); } #[test] fn test_parse_avro_codec_null() { - let codec = parse_avro_codec(None, None); + let codec = parse_avro_codec(None, None).unwrap(); assert_eq!(codec, CompressionCodec::None); } #[test] fn test_parse_avro_codec_unknown() { - let codec = parse_avro_codec(Some("unknown"), Some(1)); - assert_eq!(codec, CompressionCodec::None); + let err = parse_avro_codec(Some("unknown"), Some(1)).unwrap_err(); + assert!( + err.to_string().contains("Unsupported compression codec: unknown"), + "unexpected error: {err}" + ); } #[test] fn test_parse_avro_codec_gzip_no_level() { // Level is stored as-is (None), default applied in to_avro_codec - let codec = parse_avro_codec(Some("gzip"), None); + let codec = parse_avro_codec(Some("gzip"), None).unwrap(); assert_eq!(codec, CompressionCodec::Gzip(None)); } #[test] fn test_parse_avro_codec_zstd_no_level() { - let codec = parse_avro_codec(Some("zstd"), None); + let codec = parse_avro_codec(Some("zstd"), None).unwrap(); assert_eq!(codec, CompressionCodec::Zstd(None)); } diff --git a/crates/iceberg/src/spec/table_properties.rs b/crates/iceberg/src/spec/table_properties.rs index ebf4f55bb0..9478dfd163 100644 --- a/crates/iceberg/src/spec/table_properties.rs +++ b/crates/iceberg/src/spec/table_properties.rs @@ -349,7 +349,7 @@ impl TryFrom<&HashMap> for TableProperties { )?; let level = if level_raw == 255 { None } else { Some(level_raw) }; - avro_util::parse_avro_codec(Some(&codec_name), level) + avro_util::parse_avro_codec(Some(&codec_name), level)? }, metadata_compression_codec: parse_metadata_file_compression(props)?, write_datafusion_fanout_enabled: parse_property( From 8f6bcd0c7222ed80b7fa32bf4b2582f2afbbc57b Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Fri, 26 Jun 2026 20:23:51 +0000 Subject: [PATCH 28/33] wip --- Cargo.lock | 1 - crates/iceberg/Cargo.toml | 1 - crates/iceberg/public-api.txt | 30 ++++----- crates/iceberg/src/compression.rs | 2 + crates/iceberg/src/spec/avro_util.rs | 5 +- .../iceberg/src/spec/manifest_list/writer.rs | 63 ++++++++++++++++++- 6 files changed, 80 insertions(+), 22 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e1a8d958db..8cc236b5c5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3770,7 +3770,6 @@ dependencies = [ "futures", "iceberg_test_utils", "itertools 0.13.0", - "log", "minijinja", "miniz_oxide", "mockall", diff --git a/crates/iceberg/Cargo.toml b/crates/iceberg/Cargo.toml index cca9ddcbb3..2f8cead346 100644 --- a/crates/iceberg/Cargo.toml +++ b/crates/iceberg/Cargo.toml @@ -59,7 +59,6 @@ flate2 = { workspace = true } fnv = { workspace = true } futures = { workspace = true } itertools = { workspace = true } -log = { workspace = true } miniz_oxide = { workspace = true } moka = { version = "0.12.10", features = ["future"] } murmur3 = { workspace = true } diff --git a/crates/iceberg/public-api.txt b/crates/iceberg/public-api.txt index 653649e6cf..73e1309a29 100644 --- a/crates/iceberg/public-api.txt +++ b/crates/iceberg/public-api.txt @@ -141,15 +141,11 @@ pub fn iceberg::cache::ObjectCacheProvide::manifest_list_cache(&self) -> &dyn ic pub type iceberg::cache::ObjectCacheProvider = alloc::sync::Arc pub mod iceberg::compression pub enum iceberg::compression::CompressionCodec -pub iceberg::compression::CompressionCodec::Gzip(u8) +pub iceberg::compression::CompressionCodec::Gzip(core::option::Option) pub iceberg::compression::CompressionCodec::Lz4 pub iceberg::compression::CompressionCodec::None pub iceberg::compression::CompressionCodec::Snappy -pub iceberg::compression::CompressionCodec::Zstd(u8) -impl iceberg::compression::CompressionCodec -pub const fn iceberg::compression::CompressionCodec::gzip_default() -> Self -pub fn iceberg::compression::CompressionCodec::name(&self) -> &'static str -pub const fn iceberg::compression::CompressionCodec::zstd_default() -> Self +pub iceberg::compression::CompressionCodec::Zstd(core::option::Option) impl iceberg::compression::CompressionCodec pub fn iceberg::compression::CompressionCodec::suffix(&self) -> iceberg::Result<&'static str> impl core::clone::Clone for iceberg::compression::CompressionCodec @@ -1121,15 +1117,11 @@ pub fn iceberg::metadata_columns::row_id_field() -> &'static iceberg::spec::Nest pub fn iceberg::metadata_columns::spec_id_field() -> &'static iceberg::spec::NestedFieldRef pub mod iceberg::puffin pub enum iceberg::puffin::CompressionCodec -pub iceberg::puffin::CompressionCodec::Gzip(u8) +pub iceberg::puffin::CompressionCodec::Gzip(core::option::Option) pub iceberg::puffin::CompressionCodec::Lz4 pub iceberg::puffin::CompressionCodec::None pub iceberg::puffin::CompressionCodec::Snappy -pub iceberg::puffin::CompressionCodec::Zstd(u8) -impl iceberg::compression::CompressionCodec -pub const fn iceberg::compression::CompressionCodec::gzip_default() -> Self -pub fn iceberg::compression::CompressionCodec::name(&self) -> &'static str -pub const fn iceberg::compression::CompressionCodec::zstd_default() -> Self +pub iceberg::puffin::CompressionCodec::Zstd(core::option::Option) impl iceberg::compression::CompressionCodec pub fn iceberg::compression::CompressionCodec::suffix(&self) -> iceberg::Result<&'static str> impl core::clone::Clone for iceberg::compression::CompressionCodec @@ -2037,9 +2029,9 @@ impl iceberg::spec::ManifestListWriter pub fn iceberg::spec::ManifestListWriter::add_manifests(&mut self, manifests: impl core::iter::traits::iterator::Iterator) -> iceberg::Result<()> pub async fn iceberg::spec::ManifestListWriter::close(self) -> iceberg::Result<()> pub fn iceberg::spec::ManifestListWriter::next_row_id(&self) -> core::option::Option -pub fn iceberg::spec::ManifestListWriter::v1(writer: alloc::boxed::Box, snapshot_id: i64, parent_snapshot_id: core::option::Option) -> Self -pub fn iceberg::spec::ManifestListWriter::v2(writer: alloc::boxed::Box, snapshot_id: i64, parent_snapshot_id: core::option::Option, sequence_number: i64) -> Self -pub fn iceberg::spec::ManifestListWriter::v3(writer: alloc::boxed::Box, snapshot_id: i64, parent_snapshot_id: core::option::Option, sequence_number: i64, first_row_id: core::option::Option) -> Self +pub fn iceberg::spec::ManifestListWriter::v1(output_file: iceberg::io::OutputFile, snapshot_id: i64, parent_snapshot_id: core::option::Option, compression: apache_avro::codec::Codec) -> Self +pub fn iceberg::spec::ManifestListWriter::v2(output_file: iceberg::io::OutputFile, snapshot_id: i64, parent_snapshot_id: core::option::Option, sequence_number: i64, compression: apache_avro::codec::Codec) -> Self +pub fn iceberg::spec::ManifestListWriter::v3(output_file: iceberg::io::OutputFile, snapshot_id: i64, parent_snapshot_id: core::option::Option, sequence_number: i64, first_row_id: core::option::Option, compression: apache_avro::codec::Codec) -> Self impl core::fmt::Debug for iceberg::spec::ManifestListWriter pub fn iceberg::spec::ManifestListWriter::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result pub struct iceberg::spec::ManifestMetadata @@ -2078,8 +2070,7 @@ pub fn iceberg::spec::ManifestWriterBuilder::build_v2_data(self) -> iceberg::spe pub fn iceberg::spec::ManifestWriterBuilder::build_v2_deletes(self) -> iceberg::spec::ManifestWriter pub fn iceberg::spec::ManifestWriterBuilder::build_v3_data(self) -> iceberg::spec::ManifestWriter pub fn iceberg::spec::ManifestWriterBuilder::build_v3_deletes(self) -> iceberg::spec::ManifestWriter -pub fn iceberg::spec::ManifestWriterBuilder::new(output: iceberg::io::OutputFile, snapshot_id: core::option::Option, key_metadata: core::option::Option>, schema: iceberg::spec::SchemaRef, partition_spec: iceberg::spec::PartitionSpec) -> Self -pub fn iceberg::spec::ManifestWriterBuilder::new_from_encrypted(encrypted_output: iceberg::encryption::EncryptedOutputFile, snapshot_id: core::option::Option, key_metadata: core::option::Option>, schema: iceberg::spec::SchemaRef, partition_spec: iceberg::spec::PartitionSpec) -> Self +pub fn iceberg::spec::ManifestWriterBuilder::new(output: iceberg::io::OutputFile, snapshot_id: core::option::Option, key_metadata: core::option::Option>, schema: iceberg::spec::SchemaRef, partition_spec: iceberg::spec::PartitionSpec, compression: iceberg::compression::CompressionCodec) -> Self pub struct iceberg::spec::Map impl iceberg::spec::Map pub fn iceberg::spec::Map::get(&self, key: &iceberg::spec::Literal) -> core::option::Option<&core::option::Option> @@ -2694,6 +2685,7 @@ pub fn iceberg::spec::TableMetadataBuilder::clone(&self) -> iceberg::spec::Table impl core::fmt::Debug for iceberg::spec::TableMetadataBuilder pub fn iceberg::spec::TableMetadataBuilder::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result pub struct iceberg::spec::TableProperties +pub iceberg::spec::TableProperties::avro_compression_codec: iceberg::compression::CompressionCodec pub iceberg::spec::TableProperties::cdc_enabled: bool pub iceberg::spec::TableProperties::cdc_max_chunk_size: usize pub iceberg::spec::TableProperties::cdc_min_chunk_size: usize @@ -2713,6 +2705,10 @@ pub iceberg::spec::TableProperties::write_datafusion_fanout_enabled: bool pub iceberg::spec::TableProperties::write_format_default: alloc::string::String pub iceberg::spec::TableProperties::write_target_file_size_bytes: usize impl iceberg::spec::TableProperties +pub const iceberg::spec::TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC: &str +pub const iceberg::spec::TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC_DEFAULT: &str +pub const iceberg::spec::TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL: &str +pub const iceberg::spec::TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL_DEFAULT: core::option::Option pub const iceberg::spec::TableProperties::PROPERTY_COMMIT_MAX_RETRY_WAIT_MS: &str pub const iceberg::spec::TableProperties::PROPERTY_COMMIT_MAX_RETRY_WAIT_MS_DEFAULT: u64 pub const iceberg::spec::TableProperties::PROPERTY_COMMIT_MIN_RETRY_WAIT_MS: &str diff --git a/crates/iceberg/src/compression.rs b/crates/iceberg/src/compression.rs index f8ba17b39d..ef5c05ca77 100644 --- a/crates/iceberg/src/compression.rs +++ b/crates/iceberg/src/compression.rs @@ -27,6 +27,8 @@ use serde::{Deserialize, Deserializer, Serialize, Serializer}; use crate::{Error, ErrorKind, Result}; +// General-purpose defaults for data-file compression. Avro manifest defaults are different +// (gzip=9, zstd=1) and are defined in avro_util.rs to match the Java implementation. const ZSTD_DEFAULT_LEVEL: u8 = 3; const GZIP_DEFAULT_LEVEL: u8 = 6; const GZIP_MAX_LEVEL: u8 = 9; diff --git a/crates/iceberg/src/spec/avro_util.rs b/crates/iceberg/src/spec/avro_util.rs index b68bd62200..65a2771648 100644 --- a/crates/iceberg/src/spec/avro_util.rs +++ b/crates/iceberg/src/spec/avro_util.rs @@ -28,9 +28,10 @@ use crate::{Error, ErrorKind}; /// Codec name for uncompressed pub const CODEC_UNCOMPRESSED: &str = "uncompressed"; -/// Default compression level for gzip (matches Java implementation) +// These Avro-specific defaults match the Java Iceberg implementation (GzipCodec/ZstandardCodec). +// They differ intentionally from the general-purpose defaults in compression.rs (gzip=6, zstd=3), +// which are used for data-file compression and are not Java-compatible by design. const DEFAULT_GZIP_LEVEL: u8 = 9; -/// Default compression level for zstd (matches Java implementation) const DEFAULT_ZSTD_LEVEL: u8 = 1; /// Max supported level for ZSTD const MAX_ZSTD_LEVEL: u8 = 22; diff --git a/crates/iceberg/src/spec/manifest_list/writer.rs b/crates/iceberg/src/spec/manifest_list/writer.rs index ac1e5a8087..91f178895b 100644 --- a/crates/iceberg/src/spec/manifest_list/writer.rs +++ b/crates/iceberg/src/spec/manifest_list/writer.rs @@ -212,7 +212,7 @@ impl ManifestListWriter { } /// Write the manifest list to the output file. - pub async fn close(mut self) -> Result<()> { + pub async fn close(self) -> Result<()> { let data = self.avro_writer.into_inner()?; let mut writer = self.output_file.writer().await?; writer.write(Bytes::from(data)).await?; @@ -623,6 +623,67 @@ mod test { temp_dir.close().unwrap(); } + #[tokio::test] + async fn test_manifest_list_writer_with_compression() { + let snapshot_id = 377075049360453639; + let seq_num = 1; + + let entries: Vec = (0..1000) + .map(|i| ManifestFile { + manifest_path: format!( + "s3a://icebergdata/demo/s1/t1/metadata/very-long-path-for-compression-test/manifest-file-number-{i}.avro" + ), + manifest_length: 6926 + i, + partition_spec_id: 1, + content: ManifestContentType::Data, + sequence_number: seq_num, + min_sequence_number: seq_num, + added_snapshot_id: snapshot_id, + added_files_count: Some(1), + existing_files_count: Some(0), + deleted_files_count: Some(0), + added_rows_count: Some(3), + existing_rows_count: Some(0), + deleted_rows_count: Some(0), + partitions: None, + key_metadata: None, + first_row_id: None, + }) + .collect(); + + let tmp_dir = TempDir::new().unwrap(); + let io = FileIO::new_with_fs(); + + let uncompressed_path = tmp_dir.path().join("manifest_list_uncompressed.avro"); + let mut writer = ManifestListWriter::v2( + output_file(&uncompressed_path, &io), + snapshot_id, + Some(0), + seq_num, + Codec::Null, + ); + writer.add_manifests(entries.clone().into_iter()).unwrap(); + writer.close().await.unwrap(); + let uncompressed_size = fs::metadata(&uncompressed_path).unwrap().len(); + + let compressed_path = tmp_dir.path().join("manifest_list_compressed.avro"); + let mut writer = ManifestListWriter::v2( + output_file(&compressed_path, &io), + snapshot_id, + Some(0), + seq_num, + Codec::Deflate(apache_avro::DeflateSettings::default()), + ); + writer.add_manifests(entries.into_iter()).unwrap(); + writer.close().await.unwrap(); + let compressed_size = fs::metadata(&compressed_path).unwrap().len(); + + assert!( + compressed_size < uncompressed_size, + "compressed size ({compressed_size}) should be less than uncompressed size ({uncompressed_size})" + ); + } + fn output_file(path: &Path, io: &FileIO) -> crate::io::OutputFile { io.new_output(path.to_str().unwrap()).unwrap() } From e4188e138cb80fb136ecb6e796135eae664a2739 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Fri, 26 Jun 2026 20:28:40 +0000 Subject: [PATCH 29/33] wip --- crates/iceberg/src/spec/table_properties.rs | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/crates/iceberg/src/spec/table_properties.rs b/crates/iceberg/src/spec/table_properties.rs index 9478dfd163..77ae0ca250 100644 --- a/crates/iceberg/src/spec/table_properties.rs +++ b/crates/iceberg/src/spec/table_properties.rs @@ -341,13 +341,20 @@ impl TryFrom<&HashMap> for TableProperties { TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC_DEFAULT.to_string(), )?; - // Sentinel value 255 means level was not specified - let level_raw = parse_property( - props, - TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL, - 255u8, - )?; - let level = if level_raw == 255 { None } else { Some(level_raw) }; + let level = props + .get(TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL) + .map(|v| { + v.parse::().map_err(|e| { + Error::new( + ErrorKind::DataInvalid, + format!( + "Invalid value for {}: {e}", + TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL + ), + ) + }) + }) + .transpose()?; avro_util::parse_avro_codec(Some(&codec_name), level)? }, From e1ff71d5a812046bfa1f58846e1e18681ad3eac6 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Fri, 26 Jun 2026 20:40:01 +0000 Subject: [PATCH 30/33] fix --- crates/iceberg/public-api.txt | 6 +-- crates/iceberg/src/io/object_cache.rs | 7 ++- crates/iceberg/src/scan/mod.rs | 17 +++--- crates/iceberg/src/spec/manifest_list/mod.rs | 16 +++--- .../iceberg/src/spec/manifest_list/writer.rs | 52 ++++++++++--------- crates/iceberg/src/transaction/append.rs | 7 ++- crates/iceberg/src/transaction/snapshot.rs | 9 ++-- 7 files changed, 60 insertions(+), 54 deletions(-) diff --git a/crates/iceberg/public-api.txt b/crates/iceberg/public-api.txt index 73e1309a29..48446da160 100644 --- a/crates/iceberg/public-api.txt +++ b/crates/iceberg/public-api.txt @@ -2029,9 +2029,9 @@ impl iceberg::spec::ManifestListWriter pub fn iceberg::spec::ManifestListWriter::add_manifests(&mut self, manifests: impl core::iter::traits::iterator::Iterator) -> iceberg::Result<()> pub async fn iceberg::spec::ManifestListWriter::close(self) -> iceberg::Result<()> pub fn iceberg::spec::ManifestListWriter::next_row_id(&self) -> core::option::Option -pub fn iceberg::spec::ManifestListWriter::v1(output_file: iceberg::io::OutputFile, snapshot_id: i64, parent_snapshot_id: core::option::Option, compression: apache_avro::codec::Codec) -> Self -pub fn iceberg::spec::ManifestListWriter::v2(output_file: iceberg::io::OutputFile, snapshot_id: i64, parent_snapshot_id: core::option::Option, sequence_number: i64, compression: apache_avro::codec::Codec) -> Self -pub fn iceberg::spec::ManifestListWriter::v3(output_file: iceberg::io::OutputFile, snapshot_id: i64, parent_snapshot_id: core::option::Option, sequence_number: i64, first_row_id: core::option::Option, compression: apache_avro::codec::Codec) -> Self +pub fn iceberg::spec::ManifestListWriter::v1(output_file: iceberg::io::OutputFile, snapshot_id: i64, parent_snapshot_id: core::option::Option, compression: iceberg::compression::CompressionCodec) -> iceberg::Result +pub fn iceberg::spec::ManifestListWriter::v2(output_file: iceberg::io::OutputFile, snapshot_id: i64, parent_snapshot_id: core::option::Option, sequence_number: i64, compression: iceberg::compression::CompressionCodec) -> iceberg::Result +pub fn iceberg::spec::ManifestListWriter::v3(output_file: iceberg::io::OutputFile, snapshot_id: i64, parent_snapshot_id: core::option::Option, sequence_number: i64, first_row_id: core::option::Option, compression: iceberg::compression::CompressionCodec) -> iceberg::Result impl core::fmt::Debug for iceberg::spec::ManifestListWriter pub fn iceberg::spec::ManifestListWriter::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result pub struct iceberg::spec::ManifestMetadata diff --git a/crates/iceberg/src/io/object_cache.rs b/crates/iceberg/src/io/object_cache.rs index d5b3bb16ad..51800888dd 100644 --- a/crates/iceberg/src/io/object_cache.rs +++ b/crates/iceberg/src/io/object_cache.rs @@ -212,8 +212,6 @@ mod tests { use tempfile::TempDir; use uuid::Uuid; - use apache_avro::Codec; - use super::*; use crate::TableIdent; use crate::compression::CompressionCodec; @@ -332,8 +330,9 @@ mod tests { current_snapshot.snapshot_id(), current_snapshot.parent_snapshot_id(), current_snapshot.sequence_number(), - Codec::Null, - ); + CompressionCodec::None, + ) + .unwrap(); manifest_list_write .add_manifests(vec![data_file_manifest].into_iter()) .unwrap(); diff --git a/crates/iceberg/src/scan/mod.rs b/crates/iceberg/src/scan/mod.rs index 1c74ce7e76..babc5f3820 100644 --- a/crates/iceberg/src/scan/mod.rs +++ b/crates/iceberg/src/scan/mod.rs @@ -638,8 +638,6 @@ pub mod tests { use tempfile::TempDir; use uuid::Uuid; - use apache_avro::Codec; - use crate::arrow::ArrowReaderBuilder; use crate::compression::CompressionCodec; use crate::expr::{BoundPredicate, Reference}; @@ -930,8 +928,9 @@ pub mod tests { current_snapshot.snapshot_id(), current_snapshot.parent_snapshot_id(), current_snapshot.sequence_number(), - Codec::Null, - ); + CompressionCodec::None, + ) + .unwrap(); manifest_list_write .add_manifests(vec![data_file_manifest].into_iter()) .unwrap(); @@ -1163,8 +1162,9 @@ pub mod tests { current_snapshot.snapshot_id(), current_snapshot.parent_snapshot_id(), current_snapshot.sequence_number(), - Codec::Null, - ); + CompressionCodec::None, + ) + .unwrap(); manifest_list_write .add_manifests(vec![data_file_manifest].into_iter()) .unwrap(); @@ -1257,8 +1257,9 @@ pub mod tests { current_snapshot.snapshot_id(), current_snapshot.parent_snapshot_id(), current_snapshot.sequence_number(), - Codec::Null, - ); + CompressionCodec::None, + ) + .unwrap(); manifest_list_write .add_manifests(vec![data_manifest, delete_manifest].into_iter()) .unwrap(); diff --git a/crates/iceberg/src/spec/manifest_list/mod.rs b/crates/iceberg/src/spec/manifest_list/mod.rs index a673e626af..a1a42a3a39 100644 --- a/crates/iceberg/src/spec/manifest_list/mod.rs +++ b/crates/iceberg/src/spec/manifest_list/mod.rs @@ -99,6 +99,7 @@ mod test { use super::_const_schema::MANIFEST_LIST_AVRO_SCHEMA_V2; use super::_serde::ManifestFileV2; use super::*; + use crate::compression::CompressionCodec; use crate::io::FileIO; use crate::spec::{Datum, FieldSummary, ManifestContentType, ManifestFile}; @@ -138,8 +139,9 @@ mod test { file_io.new_output(full_path.clone()).unwrap(), 1646658105718557341, Some(1646658105718557341), - Codec::Null, - ); + CompressionCodec::None, + ) + .unwrap(); writer .add_manifests(manifest_list.entries.clone().into_iter()) @@ -212,8 +214,9 @@ mod test { 1646658105718557341, Some(1646658105718557341), 1, - Codec::Null, - ); + CompressionCodec::None, + ) + .unwrap(); writer .add_manifests(manifest_list.entries.clone().into_iter()) @@ -327,8 +330,9 @@ mod test { Some(377075049360453639), 1, Some(10), - Codec::Null, - ); + CompressionCodec::None, + ) + .unwrap(); writer .add_manifests(manifest_list.entries.clone().into_iter()) diff --git a/crates/iceberg/src/spec/manifest_list/writer.rs b/crates/iceberg/src/spec/manifest_list/writer.rs index 91f178895b..d8af18d583 100644 --- a/crates/iceberg/src/spec/manifest_list/writer.rs +++ b/crates/iceberg/src/spec/manifest_list/writer.rs @@ -17,7 +17,7 @@ use std::collections::HashMap; -use apache_avro::{Codec, Writer}; +use apache_avro::Writer; use bytes::Bytes; use super::_const_schema::{ @@ -25,8 +25,10 @@ use super::_const_schema::{ }; use super::_serde::{ManifestFileV1, ManifestFileV2, ManifestFileV3}; use super::{FormatVersion, ManifestContentType, ManifestFile, UNASSIGNED_SEQUENCE_NUMBER}; +use crate::compression::CompressionCodec; use crate::error::Result; use crate::io::OutputFile; +use crate::spec::avro_util::to_avro_codec; use crate::{Error, ErrorKind}; /// A manifest list writer. @@ -60,8 +62,8 @@ impl ManifestListWriter { output_file: OutputFile, snapshot_id: i64, parent_snapshot_id: Option, - compression: Codec, - ) -> Self { + compression: CompressionCodec, + ) -> Result { let mut metadata = HashMap::from_iter([ ("snapshot-id".to_string(), snapshot_id.to_string()), ("format-version".to_string(), "1".to_string()), @@ -89,8 +91,8 @@ impl ManifestListWriter { snapshot_id: i64, parent_snapshot_id: Option, sequence_number: i64, - compression: Codec, - ) -> Self { + compression: CompressionCodec, + ) -> Result { let mut metadata = HashMap::from_iter([ ("snapshot-id".to_string(), snapshot_id.to_string()), ("sequence-number".to_string(), sequence_number.to_string()), @@ -120,8 +122,8 @@ impl ManifestListWriter { parent_snapshot_id: Option, sequence_number: i64, first_row_id: Option, // Always None for delete manifests - compression: Codec, - ) -> Self { + compression: CompressionCodec, + ) -> Result { let mut metadata = HashMap::from_iter([ ("snapshot-id".to_string(), snapshot_id.to_string()), ("sequence-number".to_string(), sequence_number.to_string()), @@ -157,27 +159,28 @@ impl ManifestListWriter { sequence_number: i64, snapshot_id: i64, first_row_id: Option, - compression: Codec, - ) -> Self { + compression: CompressionCodec, + ) -> Result { + let codec = to_avro_codec(compression)?; let avro_schema = match format_version { FormatVersion::V1 => &MANIFEST_LIST_AVRO_SCHEMA_V1, FormatVersion::V2 => &MANIFEST_LIST_AVRO_SCHEMA_V2, FormatVersion::V3 => &MANIFEST_LIST_AVRO_SCHEMA_V3, }; - let mut avro_writer = Writer::with_codec(avro_schema, Vec::new(), compression); + let mut avro_writer = Writer::with_codec(avro_schema, Vec::new(), codec); for (key, value) in metadata { avro_writer .add_user_metadata(key, value) .expect("Avro metadata should be added to the writer before the first record."); } - Self { + Ok(Self { format_version, output_file, avro_writer, sequence_number, snapshot_id, next_row_id: first_row_id, - } + }) } /// Append manifests to be written. @@ -335,9 +338,8 @@ mod test { use tempfile::TempDir; - use apache_avro::Codec; - use super::ManifestListWriter; + use crate::compression::CompressionCodec; use crate::io::FileIO; use crate::spec::{Datum, FieldSummary, ManifestContentType, ManifestFile, ManifestList, UNASSIGNED_SEQUENCE_NUMBER}; @@ -371,7 +373,7 @@ mod test { let io = FileIO::new_with_fs(); let output_file = output_file(&path, &io); - let mut writer = ManifestListWriter::v1(output_file, 1646658105718557341, Some(0), Codec::Null); + let mut writer = ManifestListWriter::v1(output_file, 1646658105718557341, Some(0), CompressionCodec::None).unwrap(); writer .add_manifests(expected_manifest_list.entries.clone().into_iter()) .unwrap(); @@ -418,7 +420,7 @@ mod test { let io = FileIO::new_with_fs(); let output_file = output_file(&path, &io); - let mut writer = ManifestListWriter::v2(output_file, snapshot_id, Some(0), seq_num, Codec::Null); + let mut writer = ManifestListWriter::v2(output_file, snapshot_id, Some(0), seq_num, CompressionCodec::None).unwrap(); writer .add_manifests(expected_manifest_list.entries.clone().into_iter()) .unwrap(); @@ -467,7 +469,7 @@ mod test { let output_file = output_file(&path, &io); let mut writer = - ManifestListWriter::v3(output_file, snapshot_id, Some(0), seq_num, Some(10), Codec::Null); + ManifestListWriter::v3(output_file, snapshot_id, Some(0), seq_num, Some(10), CompressionCodec::None).unwrap(); writer .add_manifests(expected_manifest_list.entries.clone().into_iter()) .unwrap(); @@ -514,7 +516,7 @@ mod test { let io = FileIO::new_with_fs(); let output_file = output_file(&path, &io); - let mut writer = ManifestListWriter::v1(output_file, 1646658105718557341, Some(0), Codec::Null); + let mut writer = ManifestListWriter::v1(output_file, 1646658105718557341, Some(0), CompressionCodec::None).unwrap(); writer .add_manifests(expected_manifest_list.entries.clone().into_iter()) .unwrap(); @@ -559,7 +561,7 @@ mod test { let io = FileIO::new_with_fs(); let output_file = output_file(&path, &io); - let mut writer = ManifestListWriter::v1(output_file, 1646658105718557341, Some(0), Codec::Null); + let mut writer = ManifestListWriter::v1(output_file, 1646658105718557341, Some(0), CompressionCodec::None).unwrap(); writer .add_manifests(expected_manifest_list.entries.clone().into_iter()) .unwrap(); @@ -606,7 +608,7 @@ mod test { let io = FileIO::new_with_fs(); let output_file = output_file(&path, &io); - let mut writer = ManifestListWriter::v2(output_file, snapshot_id, Some(0), seq_num, Codec::Null); + let mut writer = ManifestListWriter::v2(output_file, snapshot_id, Some(0), seq_num, CompressionCodec::None).unwrap(); writer .add_manifests(expected_manifest_list.entries.clone().into_iter()) .unwrap(); @@ -660,8 +662,9 @@ mod test { snapshot_id, Some(0), seq_num, - Codec::Null, - ); + CompressionCodec::None, + ) + .unwrap(); writer.add_manifests(entries.clone().into_iter()).unwrap(); writer.close().await.unwrap(); let uncompressed_size = fs::metadata(&uncompressed_path).unwrap().len(); @@ -672,8 +675,9 @@ mod test { snapshot_id, Some(0), seq_num, - Codec::Deflate(apache_avro::DeflateSettings::default()), - ); + CompressionCodec::Gzip(Some(9)), + ) + .unwrap(); writer.add_manifests(entries.into_iter()).unwrap(); writer.close().await.unwrap(); let compressed_size = fs::metadata(&compressed_path).unwrap().len(); diff --git a/crates/iceberg/src/transaction/append.rs b/crates/iceberg/src/transaction/append.rs index b2fb0e8846..051d252b92 100644 --- a/crates/iceberg/src/transaction/append.rs +++ b/crates/iceberg/src/transaction/append.rs @@ -158,8 +158,6 @@ mod tests { use tempfile::TempDir; use uuid::Uuid; - use apache_avro::Codec; - use crate::compression::CompressionCodec; use crate::io::FileIO; use crate::spec::{ @@ -310,8 +308,9 @@ mod tests { current_snapshot.snapshot_id(), current_snapshot.parent_snapshot_id(), current_snapshot.sequence_number(), - Codec::Null, - ); + CompressionCodec::None, + ) + .unwrap(); manifest_list_write .add_manifests(vec![data_manifest, delete_manifest].into_iter()) .unwrap(); diff --git a/crates/iceberg/src/transaction/snapshot.rs b/crates/iceberg/src/transaction/snapshot.rs index be888ac410..88f6225897 100644 --- a/crates/iceberg/src/transaction/snapshot.rs +++ b/crates/iceberg/src/transaction/snapshot.rs @@ -25,7 +25,6 @@ use uuid::Uuid; use crate::compression::CompressionCodec; use crate::error::Result; -use crate::spec::avro_util::to_avro_codec; use crate::spec::{ DataFile, DataFileFormat, FormatVersion, MAIN_BRANCH, ManifestContentType, ManifestEntry, ManifestFile, ManifestListWriter, ManifestWriter, ManifestWriterBuilder, Operation, Snapshot, @@ -461,7 +460,7 @@ impl<'a> SnapshotProducer<'a> { .table .file_io() .new_output(manifest_list_path.clone())?; - let compression = to_avro_codec(self.avro_compression_codec()?)?; + let compression = self.avro_compression_codec()?; let mut manifest_list_writer = match self.table.metadata().format_version() { FormatVersion::V1 => ManifestListWriter::v1( @@ -469,14 +468,14 @@ impl<'a> SnapshotProducer<'a> { self.snapshot_id, self.table.metadata().current_snapshot_id(), compression, - ), + )?, FormatVersion::V2 => ManifestListWriter::v2( output_file, self.snapshot_id, self.table.metadata().current_snapshot_id(), next_seq_num, compression, - ), + )?, FormatVersion::V3 => ManifestListWriter::v3( output_file, self.snapshot_id, @@ -484,7 +483,7 @@ impl<'a> SnapshotProducer<'a> { next_seq_num, Some(first_row_id), compression, - ), + )?, }; // Calling self.summary() before self.manifest_file() is important because self.added_data_files From 5dfa4319749dd5c6b72083e321583363978d4798 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Fri, 26 Jun 2026 20:47:55 +0000 Subject: [PATCH 31/33] fixes --- crates/iceberg/public-api.txt | 1 - crates/iceberg/src/compression.rs | 5 +- crates/iceberg/src/scan/mod.rs | 7 +-- crates/iceberg/src/spec/avro_util.rs | 3 +- crates/iceberg/src/spec/manifest_list/mod.rs | 1 - .../iceberg/src/spec/manifest_list/writer.rs | 58 ++++++++++++++++--- crates/iceberg/src/spec/table_metadata.rs | 3 +- crates/iceberg/src/spec/table_properties.rs | 2 - crates/iceberg/src/transaction/snapshot.rs | 2 - 9 files changed, 59 insertions(+), 23 deletions(-) diff --git a/crates/iceberg/public-api.txt b/crates/iceberg/public-api.txt index 48446da160..39166d1186 100644 --- a/crates/iceberg/public-api.txt +++ b/crates/iceberg/public-api.txt @@ -2708,7 +2708,6 @@ impl iceberg::spec::TableProperties pub const iceberg::spec::TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC: &str pub const iceberg::spec::TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC_DEFAULT: &str pub const iceberg::spec::TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL: &str -pub const iceberg::spec::TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL_DEFAULT: core::option::Option pub const iceberg::spec::TableProperties::PROPERTY_COMMIT_MAX_RETRY_WAIT_MS: &str pub const iceberg::spec::TableProperties::PROPERTY_COMMIT_MAX_RETRY_WAIT_MS_DEFAULT: u64 pub const iceberg::spec::TableProperties::PROPERTY_COMMIT_MIN_RETRY_WAIT_MS: &str diff --git a/crates/iceberg/src/compression.rs b/crates/iceberg/src/compression.rs index ef5c05ca77..0f6e73d13c 100644 --- a/crates/iceberg/src/compression.rs +++ b/crates/iceberg/src/compression.rs @@ -131,8 +131,9 @@ impl CompressionCodec { Ok(encoder.finish()?) } CompressionCodec::Gzip(level) => { - let compression = - Compression::new(level.unwrap_or(GZIP_DEFAULT_LEVEL).min(GZIP_MAX_LEVEL) as u32); + let compression = Compression::new( + level.unwrap_or(GZIP_DEFAULT_LEVEL).min(GZIP_MAX_LEVEL) as u32, + ); let mut encoder = GzEncoder::new(Vec::new(), compression); encoder.write_all(&bytes)?; Ok(encoder.finish()?) diff --git a/crates/iceberg/src/scan/mod.rs b/crates/iceberg/src/scan/mod.rs index babc5f3820..f41ba6bebe 100644 --- a/crates/iceberg/src/scan/mod.rs +++ b/crates/iceberg/src/scan/mod.rs @@ -645,10 +645,9 @@ pub mod tests { use crate::metadata_columns::RESERVED_COL_NAME_FILE; use crate::scan::FileScanTask; use crate::spec::{ - DEFAULT_SCHEMA_NAME_MAPPING, DataContentType, DataFileBuilder, - DataFileFormat, Datum, Literal, ManifestEntry, ManifestListWriter, ManifestStatus, - ManifestWriterBuilder, NestedField, PartitionSpec, PrimitiveType, Schema, Struct, - StructType, TableMetadata, Type, + DEFAULT_SCHEMA_NAME_MAPPING, DataContentType, DataFileBuilder, DataFileFormat, Datum, + Literal, ManifestEntry, ManifestListWriter, ManifestStatus, ManifestWriterBuilder, + NestedField, PartitionSpec, PrimitiveType, Schema, Struct, StructType, TableMetadata, Type, }; use crate::table::Table; use crate::test_utils::test_runtime; diff --git a/crates/iceberg/src/spec/avro_util.rs b/crates/iceberg/src/spec/avro_util.rs index 65a2771648..4d192fd600 100644 --- a/crates/iceberg/src/spec/avro_util.rs +++ b/crates/iceberg/src/spec/avro_util.rs @@ -138,7 +138,8 @@ mod tests { fn test_parse_avro_codec_unknown() { let err = parse_avro_codec(Some("unknown"), Some(1)).unwrap_err(); assert!( - err.to_string().contains("Unsupported compression codec: unknown"), + err.to_string() + .contains("Unsupported compression codec: unknown"), "unexpected error: {err}" ); } diff --git a/crates/iceberg/src/spec/manifest_list/mod.rs b/crates/iceberg/src/spec/manifest_list/mod.rs index a1a42a3a39..82f2b8bdbc 100644 --- a/crates/iceberg/src/spec/manifest_list/mod.rs +++ b/crates/iceberg/src/spec/manifest_list/mod.rs @@ -101,7 +101,6 @@ mod test { use super::*; use crate::compression::CompressionCodec; use crate::io::FileIO; - use crate::spec::{Datum, FieldSummary, ManifestContentType, ManifestFile}; #[tokio::test] diff --git a/crates/iceberg/src/spec/manifest_list/writer.rs b/crates/iceberg/src/spec/manifest_list/writer.rs index d8af18d583..b919c7bdb6 100644 --- a/crates/iceberg/src/spec/manifest_list/writer.rs +++ b/crates/iceberg/src/spec/manifest_list/writer.rs @@ -341,7 +341,10 @@ mod test { use super::ManifestListWriter; use crate::compression::CompressionCodec; use crate::io::FileIO; - use crate::spec::{Datum, FieldSummary, ManifestContentType, ManifestFile, ManifestList, UNASSIGNED_SEQUENCE_NUMBER}; + use crate::spec::{ + Datum, FieldSummary, ManifestContentType, ManifestFile, ManifestList, + UNASSIGNED_SEQUENCE_NUMBER, + }; #[tokio::test] async fn test_manifest_list_writer_v1() { @@ -373,7 +376,13 @@ mod test { let io = FileIO::new_with_fs(); let output_file = output_file(&path, &io); - let mut writer = ManifestListWriter::v1(output_file, 1646658105718557341, Some(0), CompressionCodec::None).unwrap(); + let mut writer = ManifestListWriter::v1( + output_file, + 1646658105718557341, + Some(0), + CompressionCodec::None, + ) + .unwrap(); writer .add_manifests(expected_manifest_list.entries.clone().into_iter()) .unwrap(); @@ -420,7 +429,14 @@ mod test { let io = FileIO::new_with_fs(); let output_file = output_file(&path, &io); - let mut writer = ManifestListWriter::v2(output_file, snapshot_id, Some(0), seq_num, CompressionCodec::None).unwrap(); + let mut writer = ManifestListWriter::v2( + output_file, + snapshot_id, + Some(0), + seq_num, + CompressionCodec::None, + ) + .unwrap(); writer .add_manifests(expected_manifest_list.entries.clone().into_iter()) .unwrap(); @@ -468,8 +484,15 @@ mod test { let io = FileIO::new_with_fs(); let output_file = output_file(&path, &io); - let mut writer = - ManifestListWriter::v3(output_file, snapshot_id, Some(0), seq_num, Some(10), CompressionCodec::None).unwrap(); + let mut writer = ManifestListWriter::v3( + output_file, + snapshot_id, + Some(0), + seq_num, + Some(10), + CompressionCodec::None, + ) + .unwrap(); writer .add_manifests(expected_manifest_list.entries.clone().into_iter()) .unwrap(); @@ -516,7 +539,13 @@ mod test { let io = FileIO::new_with_fs(); let output_file = output_file(&path, &io); - let mut writer = ManifestListWriter::v1(output_file, 1646658105718557341, Some(0), CompressionCodec::None).unwrap(); + let mut writer = ManifestListWriter::v1( + output_file, + 1646658105718557341, + Some(0), + CompressionCodec::None, + ) + .unwrap(); writer .add_manifests(expected_manifest_list.entries.clone().into_iter()) .unwrap(); @@ -561,7 +590,13 @@ mod test { let io = FileIO::new_with_fs(); let output_file = output_file(&path, &io); - let mut writer = ManifestListWriter::v1(output_file, 1646658105718557341, Some(0), CompressionCodec::None).unwrap(); + let mut writer = ManifestListWriter::v1( + output_file, + 1646658105718557341, + Some(0), + CompressionCodec::None, + ) + .unwrap(); writer .add_manifests(expected_manifest_list.entries.clone().into_iter()) .unwrap(); @@ -608,7 +643,14 @@ mod test { let io = FileIO::new_with_fs(); let output_file = output_file(&path, &io); - let mut writer = ManifestListWriter::v2(output_file, snapshot_id, Some(0), seq_num, CompressionCodec::None).unwrap(); + let mut writer = ManifestListWriter::v2( + output_file, + snapshot_id, + Some(0), + seq_num, + CompressionCodec::None, + ) + .unwrap(); writer .add_manifests(expected_manifest_list.entries.clone().into_iter()) .unwrap(); diff --git a/crates/iceberg/src/spec/table_metadata.rs b/crates/iceberg/src/spec/table_metadata.rs index c1f09ac0f5..5c68c28443 100644 --- a/crates/iceberg/src/spec/table_metadata.rs +++ b/crates/iceberg/src/spec/table_metadata.rs @@ -1614,13 +1614,12 @@ mod tests { use crate::catalog::MetadataLocation; use crate::compression::CompressionCodec; use crate::io::FileIO; - use crate::spec::TableProperties; use crate::spec::table_metadata::TableMetadata; use crate::spec::{ BlobMetadata, EncryptedKey, INITIAL_ROW_ID, Literal, NestedField, NullOrder, Operation, PartitionSpec, PartitionStatisticsFile, PrimitiveLiteral, PrimitiveType, Schema, Snapshot, SnapshotReference, SnapshotRetention, SortDirection, SortField, SortOrder, StatisticsFile, - Summary, Transform, Type, UnboundPartitionField, + Summary, TableProperties, Transform, Type, UnboundPartitionField, }; use crate::{ErrorKind, TableCreation}; diff --git a/crates/iceberg/src/spec/table_properties.rs b/crates/iceberg/src/spec/table_properties.rs index 77ae0ca250..73b47dc4e5 100644 --- a/crates/iceberg/src/spec/table_properties.rs +++ b/crates/iceberg/src/spec/table_properties.rs @@ -239,8 +239,6 @@ impl TableProperties { /// Compression level for Avro files pub const PROPERTY_AVRO_COMPRESSION_LEVEL: &str = "write.avro.compression-level"; - /// Default Avro compression level (None, uses codec-specific defaults: gzip=9, zstd=1) - pub const PROPERTY_AVRO_COMPRESSION_LEVEL_DEFAULT: Option = None; /// Whether to use `FanoutWriter` for partitioned tables (handles unsorted data). /// If false, uses `ClusteredWriter` (requires sorted data, more memory efficient). diff --git a/crates/iceberg/src/transaction/snapshot.rs b/crates/iceberg/src/transaction/snapshot.rs index 88f6225897..b5163600cc 100644 --- a/crates/iceberg/src/transaction/snapshot.rs +++ b/crates/iceberg/src/transaction/snapshot.rs @@ -265,7 +265,6 @@ impl<'a> SnapshotProducer<'a> { DataFileFormat::Avro ); let output_file = self.table.file_io().new_output(new_manifest_path)?; - let builder = ManifestWriterBuilder::new( output_file, Some(self.snapshot_id), @@ -278,7 +277,6 @@ impl<'a> SnapshotProducer<'a> { .clone(), self.avro_compression_codec()?, ); - match self.table.metadata().format_version() { FormatVersion::V1 => Ok(builder.build_v1()), FormatVersion::V2 => match content { From 091e160cf8b7ffea376d78709c0e3504ad06c587 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Tue, 31 Mar 2026 18:13:17 +0000 Subject: [PATCH 32/33] simplify test --- crates/iceberg/src/spec/manifest/writer.rs | 197 +++++++++------------ 1 file changed, 85 insertions(+), 112 deletions(-) diff --git a/crates/iceberg/src/spec/manifest/writer.rs b/crates/iceberg/src/spec/manifest/writer.rs index ce7f5121bf..874047922f 100644 --- a/crates/iceberg/src/spec/manifest/writer.rs +++ b/crates/iceberg/src/spec/manifest/writer.rs @@ -760,6 +760,91 @@ mod tests { assert_eq!(actual_manifest, Manifest::new(metadata, entries)); } + #[tokio::test] + async fn test_manifest_writer_with_compression() { + let metadata = { + let schema = Schema::builder() + .with_fields(vec![Arc::new(NestedField::required( + 1, + "id", + Type::Primitive(PrimitiveType::Int), + ))]) + .build() + .unwrap(); + + ManifestMetadata { + schema_id: 0, + schema: Arc::new(schema), + partition_spec: PartitionSpec::unpartition_spec(), + format_version: FormatVersion::V2, + content: ManifestContentType::Data, + } + }; + + async fn write_manifest( + io: &FileIO, + path: &std::path::Path, + metadata: &ManifestMetadata, + compression: CompressionCodec, + ) { + let output_file = io.new_output(path.to_str().unwrap()).unwrap(); + let mut writer = ManifestWriterBuilder::new( + output_file, + Some(1), + None, + metadata.schema.clone(), + metadata.partition_spec.clone(), + compression, + ) + .build_v2_data(); + for i in 0..1000 { + let data_file = DataFileBuilder::default() + .content(DataContentType::Data) + .file_path(format!( + "/very/long/path/to/data/directory/with/many/subdirectories/file_{i}.parquet" + )) + .file_format(DataFileFormat::Parquet) + .partition(Struct::empty()) + .file_size_in_bytes(100000 + i) + .record_count(1000 + i) + .build() + .unwrap(); + let entry = ManifestEntry::builder() + .status(ManifestStatus::Added) + .snapshot_id(1) + .sequence_number(1) + .file_sequence_number(1) + .data_file(data_file) + .build(); + writer.add_entry(entry).unwrap(); + } + writer.write_manifest_file().await.unwrap(); + } + + let tmp_dir = TempDir::new().unwrap(); + let io = FileIO::new_with_fs(); + let uncompressed_path = tmp_dir.path().join("uncompressed_manifest.avro"); + let compressed_path = tmp_dir.path().join("compressed_manifest.avro"); + + write_manifest(&io, &uncompressed_path, &metadata, CompressionCodec::None).await; + write_manifest(&io, &compressed_path, &metadata, CompressionCodec::Gzip(9)).await; + + let uncompressed_size = fs::metadata(&uncompressed_path).unwrap().len(); + let compressed_size = fs::metadata(&compressed_path).unwrap().len(); + + // Verify compression is actually working + assert!( + compressed_size < uncompressed_size, + "Compressed size ({compressed_size}) should be less than uncompressed size ({uncompressed_size})" + ); + + // Verify the compressed file can be read back correctly + let compressed_bytes = fs::read(&compressed_path).unwrap(); + let manifest = Manifest::parse_avro(&compressed_bytes).unwrap(); + assert_eq!(manifest.metadata.format_version, FormatVersion::V2); + assert_eq!(manifest.entries.len(), 1000); + } + #[tokio::test] async fn test_v3_delete_manifest_delete_file_roundtrip() { let schema = Arc::new( @@ -850,116 +935,4 @@ mod tests { ); } - #[tokio::test] - async fn test_manifest_writer_with_compression() { - let metadata = { - let schema = Schema::builder() - .with_fields(vec![Arc::new(NestedField::required( - 1, - "id", - Type::Primitive(PrimitiveType::Int), - ))]) - .build() - .unwrap(); - - ManifestMetadata { - schema_id: 0, - schema: Arc::new(schema), - partition_spec: PartitionSpec::unpartition_spec(), - format_version: FormatVersion::V2, - content: ManifestContentType::Data, - } - }; - - // Write uncompressed manifest with multiple entries to make compression effective - let tmp_dir = TempDir::new().unwrap(); - let uncompressed_path = tmp_dir.path().join("uncompressed_manifest.avro"); - let io = FileIO::new_with_fs(); - let output_file = io.new_output(uncompressed_path.to_str().unwrap()).unwrap(); - let mut writer = ManifestWriterBuilder::new( - output_file, - Some(1), - None, - metadata.schema.clone(), - metadata.partition_spec.clone(), - CompressionCodec::None, - ) - .build_v2_data(); - // Add multiple entries with long paths to create compressible data - for i in 0..1000 { - let data_file = DataFileBuilder::default() - .content(DataContentType::Data) - .file_path(format!( - "/very/long/path/to/data/directory/with/many/subdirectories/file_{i}.parquet" - )) - .file_format(DataFileFormat::Parquet) - .partition(Struct::empty()) - .file_size_in_bytes(100000 + i) - .record_count(1000 + i) - .build() - .unwrap(); - - let entry = ManifestEntry::builder() - .status(ManifestStatus::Added) - .snapshot_id(1) - .sequence_number(1) - .file_sequence_number(1) - .data_file(data_file) - .build(); - writer.add_entry(entry).unwrap(); - } - writer.write_manifest_file().await.unwrap(); - let uncompressed_size = fs::metadata(&uncompressed_path).unwrap().len(); - - // Write compressed manifest with gzip - let compressed_path = tmp_dir.path().join("compressed_manifest.avro"); - let output_file = io.new_output(compressed_path.to_str().unwrap()).unwrap(); - let compression = CompressionCodec::Gzip(Some(9)); - let mut writer = ManifestWriterBuilder::new( - output_file, - Some(1), - None, - metadata.schema.clone(), - metadata.partition_spec.clone(), - compression, - ) - .build_v2_data(); - // Add the same entries with long paths as the uncompressed version - for i in 0..1000 { - let data_file = DataFileBuilder::default() - .content(DataContentType::Data) - .file_path(format!( - "/very/long/path/to/data/directory/with/many/subdirectories/file_{i}.parquet" - )) - .file_format(DataFileFormat::Parquet) - .partition(Struct::empty()) - .file_size_in_bytes(100000 + i) - .record_count(1000 + i) - .build() - .unwrap(); - - let entry = ManifestEntry::builder() - .status(ManifestStatus::Added) - .snapshot_id(1) - .sequence_number(1) - .file_sequence_number(1) - .data_file(data_file) - .build(); - writer.add_entry(entry).unwrap(); - } - writer.write_manifest_file().await.unwrap(); - let compressed_size = fs::metadata(&compressed_path).unwrap().len(); - - // Verify compression is actually working - assert!( - compressed_size < uncompressed_size, - "Compressed size ({compressed_size}) should be less than uncompressed size ({uncompressed_size})" - ); - - // Verify the compressed file can be read back correctly - let compressed_bytes = fs::read(&compressed_path).unwrap(); - let manifest = Manifest::parse_avro(&compressed_bytes).unwrap(); - assert_eq!(manifest.metadata.format_version, FormatVersion::V2); - assert_eq!(manifest.entries.len(), 1000); - } } From ef52ddef54ea1bf04bd8625ac1b65e48079fd8c1 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Fri, 26 Jun 2026 21:01:59 +0000 Subject: [PATCH 33/33] fix compression test and add manifest-list round-trip assertion - Fix Gzip(9) -> Gzip(Some(9)) in simplified compression test (rebase reintroduced the pre-Option form) - Add read-back assertion to manifest-list compression test so it verifies the compressed avro is parseable, not just smaller Co-authored-by: Isaac --- crates/iceberg/src/spec/manifest/writer.rs | 2 +- crates/iceberg/src/spec/manifest_list/writer.rs | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/crates/iceberg/src/spec/manifest/writer.rs b/crates/iceberg/src/spec/manifest/writer.rs index 874047922f..c635f44d88 100644 --- a/crates/iceberg/src/spec/manifest/writer.rs +++ b/crates/iceberg/src/spec/manifest/writer.rs @@ -827,7 +827,7 @@ mod tests { let compressed_path = tmp_dir.path().join("compressed_manifest.avro"); write_manifest(&io, &uncompressed_path, &metadata, CompressionCodec::None).await; - write_manifest(&io, &compressed_path, &metadata, CompressionCodec::Gzip(9)).await; + write_manifest(&io, &compressed_path, &metadata, CompressionCodec::Gzip(Some(9))).await; let uncompressed_size = fs::metadata(&uncompressed_path).unwrap().len(); let compressed_size = fs::metadata(&compressed_path).unwrap().len(); diff --git a/crates/iceberg/src/spec/manifest_list/writer.rs b/crates/iceberg/src/spec/manifest_list/writer.rs index b919c7bdb6..a920a83a75 100644 --- a/crates/iceberg/src/spec/manifest_list/writer.rs +++ b/crates/iceberg/src/spec/manifest_list/writer.rs @@ -728,6 +728,12 @@ mod test { compressed_size < uncompressed_size, "compressed size ({compressed_size}) should be less than uncompressed size ({uncompressed_size})" ); + + let compressed_bytes = fs::read(&compressed_path).unwrap(); + let manifest_list = + ManifestList::parse_with_version(&compressed_bytes, crate::spec::FormatVersion::V2) + .unwrap(); + assert_eq!(manifest_list.entries().len(), 1000); } fn output_file(path: &Path, io: &FileIO) -> crate::io::OutputFile {