diff --git a/Cargo.lock b/Cargo.lock index c7c5f6680d..8cc236b5c5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3771,6 +3771,7 @@ dependencies = [ "iceberg_test_utils", "itertools 0.13.0", "minijinja", + "miniz_oxide", "mockall", "moka", "murmur3", @@ -6020,9 +6021,9 @@ dependencies = [ [[package]] name = "quinn-proto" -version = "0.11.15" +version = "0.11.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fcb935c5bec503c2f0e306bdd3e58bb9029dcb14fa8d9ac76e3a5256ac0763e" +checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098" dependencies = [ "aws-lc-rs", "bytes", diff --git a/Cargo.toml b/Cargo.toml index f11112109a..bfd36642f4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -111,6 +111,7 @@ log = "0.4.28" metainfo = "0.7.14" mimalloc = "0.1.46" minijinja = "2.12.0" +miniz_oxide = "0.8" mockall = "0.13.1" mockito = "1" motore-macros = "0.4.3" diff --git a/crates/iceberg/Cargo.toml b/crates/iceberg/Cargo.toml index 9353a31842..2f8cead346 100644 --- a/crates/iceberg/Cargo.toml +++ b/crates/iceberg/Cargo.toml @@ -59,6 +59,7 @@ flate2 = { workspace = true } fnv = { workspace = true } futures = { workspace = true } itertools = { workspace = true } +miniz_oxide = { workspace = true } moka = { version = "0.12.10", features = ["future"] } murmur3 = { workspace = true } once_cell = { workspace = true } diff --git a/crates/iceberg/public-api.txt b/crates/iceberg/public-api.txt index 653649e6cf..39166d1186 100644 --- a/crates/iceberg/public-api.txt +++ b/crates/iceberg/public-api.txt @@ -141,15 +141,11 @@ pub fn iceberg::cache::ObjectCacheProvide::manifest_list_cache(&self) -> &dyn ic pub type iceberg::cache::ObjectCacheProvider = alloc::sync::Arc pub mod iceberg::compression pub enum iceberg::compression::CompressionCodec -pub iceberg::compression::CompressionCodec::Gzip(u8) +pub iceberg::compression::CompressionCodec::Gzip(core::option::Option) pub iceberg::compression::CompressionCodec::Lz4 pub iceberg::compression::CompressionCodec::None pub iceberg::compression::CompressionCodec::Snappy -pub iceberg::compression::CompressionCodec::Zstd(u8) -impl iceberg::compression::CompressionCodec -pub const fn iceberg::compression::CompressionCodec::gzip_default() -> Self -pub fn iceberg::compression::CompressionCodec::name(&self) -> &'static str -pub const fn iceberg::compression::CompressionCodec::zstd_default() -> Self +pub iceberg::compression::CompressionCodec::Zstd(core::option::Option) impl iceberg::compression::CompressionCodec pub fn iceberg::compression::CompressionCodec::suffix(&self) -> iceberg::Result<&'static str> impl core::clone::Clone for iceberg::compression::CompressionCodec @@ -1121,15 +1117,11 @@ pub fn iceberg::metadata_columns::row_id_field() -> &'static iceberg::spec::Nest pub fn iceberg::metadata_columns::spec_id_field() -> &'static iceberg::spec::NestedFieldRef pub mod iceberg::puffin pub enum iceberg::puffin::CompressionCodec -pub iceberg::puffin::CompressionCodec::Gzip(u8) +pub iceberg::puffin::CompressionCodec::Gzip(core::option::Option) pub iceberg::puffin::CompressionCodec::Lz4 pub iceberg::puffin::CompressionCodec::None pub iceberg::puffin::CompressionCodec::Snappy -pub iceberg::puffin::CompressionCodec::Zstd(u8) -impl iceberg::compression::CompressionCodec -pub const fn iceberg::compression::CompressionCodec::gzip_default() -> Self -pub fn iceberg::compression::CompressionCodec::name(&self) -> &'static str -pub const fn iceberg::compression::CompressionCodec::zstd_default() -> Self +pub iceberg::puffin::CompressionCodec::Zstd(core::option::Option) impl iceberg::compression::CompressionCodec pub fn iceberg::compression::CompressionCodec::suffix(&self) -> iceberg::Result<&'static str> impl core::clone::Clone for iceberg::compression::CompressionCodec @@ -2037,9 +2029,9 @@ impl iceberg::spec::ManifestListWriter pub fn iceberg::spec::ManifestListWriter::add_manifests(&mut self, manifests: impl core::iter::traits::iterator::Iterator) -> iceberg::Result<()> pub async fn iceberg::spec::ManifestListWriter::close(self) -> iceberg::Result<()> pub fn iceberg::spec::ManifestListWriter::next_row_id(&self) -> core::option::Option -pub fn iceberg::spec::ManifestListWriter::v1(writer: alloc::boxed::Box, snapshot_id: i64, parent_snapshot_id: core::option::Option) -> Self -pub fn iceberg::spec::ManifestListWriter::v2(writer: alloc::boxed::Box, snapshot_id: i64, parent_snapshot_id: core::option::Option, sequence_number: i64) -> Self -pub fn iceberg::spec::ManifestListWriter::v3(writer: alloc::boxed::Box, snapshot_id: i64, parent_snapshot_id: core::option::Option, sequence_number: i64, first_row_id: core::option::Option) -> Self +pub fn iceberg::spec::ManifestListWriter::v1(output_file: iceberg::io::OutputFile, snapshot_id: i64, parent_snapshot_id: core::option::Option, compression: iceberg::compression::CompressionCodec) -> iceberg::Result +pub fn iceberg::spec::ManifestListWriter::v2(output_file: iceberg::io::OutputFile, snapshot_id: i64, parent_snapshot_id: core::option::Option, sequence_number: i64, compression: iceberg::compression::CompressionCodec) -> iceberg::Result +pub fn iceberg::spec::ManifestListWriter::v3(output_file: iceberg::io::OutputFile, snapshot_id: i64, parent_snapshot_id: core::option::Option, sequence_number: i64, first_row_id: core::option::Option, compression: iceberg::compression::CompressionCodec) -> iceberg::Result impl core::fmt::Debug for iceberg::spec::ManifestListWriter pub fn iceberg::spec::ManifestListWriter::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result pub struct iceberg::spec::ManifestMetadata @@ -2078,8 +2070,7 @@ pub fn iceberg::spec::ManifestWriterBuilder::build_v2_data(self) -> iceberg::spe pub fn iceberg::spec::ManifestWriterBuilder::build_v2_deletes(self) -> iceberg::spec::ManifestWriter pub fn iceberg::spec::ManifestWriterBuilder::build_v3_data(self) -> iceberg::spec::ManifestWriter pub fn iceberg::spec::ManifestWriterBuilder::build_v3_deletes(self) -> iceberg::spec::ManifestWriter -pub fn iceberg::spec::ManifestWriterBuilder::new(output: iceberg::io::OutputFile, snapshot_id: core::option::Option, key_metadata: core::option::Option>, schema: iceberg::spec::SchemaRef, partition_spec: iceberg::spec::PartitionSpec) -> Self -pub fn iceberg::spec::ManifestWriterBuilder::new_from_encrypted(encrypted_output: iceberg::encryption::EncryptedOutputFile, snapshot_id: core::option::Option, key_metadata: core::option::Option>, schema: iceberg::spec::SchemaRef, partition_spec: iceberg::spec::PartitionSpec) -> Self +pub fn iceberg::spec::ManifestWriterBuilder::new(output: iceberg::io::OutputFile, snapshot_id: core::option::Option, key_metadata: core::option::Option>, schema: iceberg::spec::SchemaRef, partition_spec: iceberg::spec::PartitionSpec, compression: iceberg::compression::CompressionCodec) -> Self pub struct iceberg::spec::Map impl iceberg::spec::Map pub fn iceberg::spec::Map::get(&self, key: &iceberg::spec::Literal) -> core::option::Option<&core::option::Option> @@ -2694,6 +2685,7 @@ pub fn iceberg::spec::TableMetadataBuilder::clone(&self) -> iceberg::spec::Table impl core::fmt::Debug for iceberg::spec::TableMetadataBuilder pub fn iceberg::spec::TableMetadataBuilder::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result pub struct iceberg::spec::TableProperties +pub iceberg::spec::TableProperties::avro_compression_codec: iceberg::compression::CompressionCodec pub iceberg::spec::TableProperties::cdc_enabled: bool pub iceberg::spec::TableProperties::cdc_max_chunk_size: usize pub iceberg::spec::TableProperties::cdc_min_chunk_size: usize @@ -2713,6 +2705,9 @@ pub iceberg::spec::TableProperties::write_datafusion_fanout_enabled: bool pub iceberg::spec::TableProperties::write_format_default: alloc::string::String pub iceberg::spec::TableProperties::write_target_file_size_bytes: usize impl iceberg::spec::TableProperties +pub const iceberg::spec::TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC: &str +pub const iceberg::spec::TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC_DEFAULT: &str +pub const iceberg::spec::TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL: &str pub const iceberg::spec::TableProperties::PROPERTY_COMMIT_MAX_RETRY_WAIT_MS: &str pub const iceberg::spec::TableProperties::PROPERTY_COMMIT_MAX_RETRY_WAIT_MS_DEFAULT: u64 pub const iceberg::spec::TableProperties::PROPERTY_COMMIT_MIN_RETRY_WAIT_MS: &str diff --git a/crates/iceberg/src/catalog/metadata_location.rs b/crates/iceberg/src/catalog/metadata_location.rs index acd041d5e1..73333c4f42 100644 --- a/crates/iceberg/src/catalog/metadata_location.rs +++ b/crates/iceberg/src/catalog/metadata_location.rs @@ -114,9 +114,9 @@ impl MetadataLocation { ))?; // Check for compression suffix (e.g., .gz) - let gzip_suffix = CompressionCodec::gzip_default().suffix()?; + let gzip_suffix = CompressionCodec::Gzip(None).suffix()?; let (stripped, compression_codec) = if let Some(s) = stripped.strip_suffix(gzip_suffix) { - (s, CompressionCodec::gzip_default()) + (s, CompressionCodec::Gzip(None)) } else { (stripped, CompressionCodec::None) }; @@ -261,7 +261,7 @@ mod test { table_location: "/abc".to_string(), version: 1234567, id: Uuid::from_str("2cd22b57-5127-4198-92ba-e4e67c79821b").unwrap(), - compression_codec: CompressionCodec::gzip_default(), + compression_codec: CompressionCodec::Gzip(None), }), ), // Negative version @@ -347,14 +347,11 @@ mod test { .unwrap(); assert_eq!( location_gzip.compression_codec, - CompressionCodec::gzip_default() + CompressionCodec::Gzip(None) ); let next_gzip = location_gzip.with_next_version(); - assert_eq!( - next_gzip.compression_codec, - CompressionCodec::gzip_default() - ); + assert_eq!(next_gzip.compression_codec, CompressionCodec::Gzip(None)); assert_eq!(next_gzip.version, 6); } @@ -375,10 +372,7 @@ mod test { ); let metadata_gzip = create_test_metadata(props_gzip); let updated_gzip = location.with_new_metadata(&metadata_gzip); - assert_eq!( - updated_gzip.compression_codec, - CompressionCodec::gzip_default() - ); + assert_eq!(updated_gzip.compression_codec, CompressionCodec::Gzip(None)); assert_eq!(updated_gzip.version, 0); assert_eq!( updated_gzip.to_string(), diff --git a/crates/iceberg/src/compression.rs b/crates/iceberg/src/compression.rs index 929d9226e7..0f6e73d13c 100644 --- a/crates/iceberg/src/compression.rs +++ b/crates/iceberg/src/compression.rs @@ -27,11 +27,10 @@ use serde::{Deserialize, Deserializer, Serialize, Serializer}; use crate::{Error, ErrorKind, Result}; -/// Default compression level for Zstandard (zstd). +// General-purpose defaults for data-file compression. Avro manifest defaults are different +// (gzip=9, zstd=1) and are defined in avro_util.rs to match the Java implementation. const ZSTD_DEFAULT_LEVEL: u8 = 3; -/// Default compression level for Gzip. const GZIP_DEFAULT_LEVEL: u8 = 6; -/// Maximum compression level for Gzip. const GZIP_MAX_LEVEL: u8 = 9; /// Data compression formats @@ -42,47 +41,24 @@ pub enum CompressionCodec { None, /// LZ4 single compression frame with content size present Lz4, - /// Zstandard single compression frame with content size present. - /// Level range is 0–22, where 0 means default compression level (not no compression). - /// Use [`CompressionCodec::zstd_default`] to construct with the default level. - Zstd(u8), - /// Gzip compression. Level range is 0–9, where 0 means no compression. - /// Use [`CompressionCodec::gzip_default`] to construct with the default level. - Gzip(u8), + /// Zstandard single compression frame with content size present. Optional level 0–22. + Zstd(Option), + /// Gzip compression. Optional level 0–9. + Gzip(Option), /// Snappy compression Snappy, } -impl CompressionCodec { - /// Returns a Zstd codec with the default compression level. - pub const fn zstd_default() -> Self { - CompressionCodec::Zstd(ZSTD_DEFAULT_LEVEL) - } - - /// Returns a Gzip codec with the default compression level. - pub const fn gzip_default() -> Self { - CompressionCodec::Gzip(GZIP_DEFAULT_LEVEL) - } - - /// Returns the codec name as used in serialization and error messages. - pub fn name(&self) -> &'static str { - match self { +impl Serialize for CompressionCodec { + fn serialize(&self, serializer: S) -> std::result::Result { + let s = match self { CompressionCodec::None => "none", CompressionCodec::Lz4 => "lz4", CompressionCodec::Zstd(_) => "zstd", CompressionCodec::Gzip(_) => "gzip", CompressionCodec::Snappy => "snappy", - } - } -} - -// Note: serialize/deserialize do not round-trip the compression level. Iceberg configuration -// only the codec name (e.g. "zstd"), not the level, so deserialization always produces the -// default level. A `Zstd(5)` written to metadata will be read back as `Zstd(3)`. Some -// compression configuration (e.g. Avro metadata) has a separate level field alongside the codec name. -impl Serialize for CompressionCodec { - fn serialize(&self, serializer: S) -> std::result::Result { - serializer.serialize_str(self.name()) + }; + serializer.serialize_str(s) } } @@ -92,8 +68,8 @@ impl<'de> Deserialize<'de> for CompressionCodec { match s.to_lowercase().as_str() { "none" => Ok(CompressionCodec::None), "lz4" => Ok(CompressionCodec::Lz4), - "zstd" => Ok(CompressionCodec::zstd_default()), - "gzip" => Ok(CompressionCodec::gzip_default()), + "zstd" => Ok(CompressionCodec::Zstd(None)), + "gzip" => Ok(CompressionCodec::Gzip(None)), "snappy" => Ok(CompressionCodec::Snappy), other => Err(serde::de::Error::unknown_variant(other, &[ "none", "lz4", "zstd", "gzip", "snappy", @@ -107,8 +83,10 @@ impl fmt::Display for CompressionCodec { match self { CompressionCodec::None => write!(f, "None"), CompressionCodec::Lz4 => write!(f, "Lz4"), - CompressionCodec::Zstd(level) => write!(f, "Zstd(level={level})"), - CompressionCodec::Gzip(level) => write!(f, "Gzip(level={level})"), + CompressionCodec::Zstd(Some(level)) => write!(f, "Zstd(level={level})"), + CompressionCodec::Zstd(None) => write!(f, "Zstd"), + CompressionCodec::Gzip(Some(level)) => write!(f, "Gzip(level={level})"), + CompressionCodec::Gzip(None) => write!(f, "Gzip"), CompressionCodec::Snappy => write!(f, "Snappy"), } } @@ -145,14 +123,17 @@ impl CompressionCodec { )), CompressionCodec::Zstd(level) => { let writer = Vec::::new(); - let mut encoder = zstd::stream::Encoder::new(writer, *level as i32)?; + let mut encoder = + zstd::stream::Encoder::new(writer, level.unwrap_or(ZSTD_DEFAULT_LEVEL) as i32)?; encoder.include_checksum(true)?; encoder.set_pledged_src_size(Some(bytes.len().try_into()?))?; std::io::copy(&mut &bytes[..], &mut encoder)?; Ok(encoder.finish()?) } CompressionCodec::Gzip(level) => { - let compression = Compression::new((*level).min(GZIP_MAX_LEVEL) as u32); + let compression = Compression::new( + level.unwrap_or(GZIP_DEFAULT_LEVEL).min(GZIP_MAX_LEVEL) as u32, + ); let mut encoder = GzEncoder::new(Vec::new(), compression); encoder.write_all(&bytes)?; Ok(encoder.finish()?) @@ -164,6 +145,16 @@ impl CompressionCodec { } } + /// Replace the compression level, if this variant carries one. + /// Variants without a level (`None`, `Lz4`, `Snappy`) are returned unchanged. + pub(crate) fn with_level(self, level: Option) -> Self { + match self { + CompressionCodec::Gzip(_) => CompressionCodec::Gzip(level), + CompressionCodec::Zstd(_) => CompressionCodec::Zstd(level), + other => other, + } + } + pub(crate) fn is_none(&self) -> bool { matches!(self, CompressionCodec::None) } @@ -173,7 +164,7 @@ impl CompressionCodec { /// /// # Errors /// - /// Returns an error for Lz4 and Zstd as they are not fully supported. + /// Returns an error for Lz4, Zstd, and Snappy as they are not fully supported. pub fn suffix(&self) -> Result<&'static str> { match self { CompressionCodec::None => Ok(""), @@ -207,10 +198,7 @@ mod tests { async fn test_compression_codec_compress() { let bytes_vec = [0_u8; 100].to_vec(); - let compression_codecs = [ - CompressionCodec::zstd_default(), - CompressionCodec::gzip_default(), - ]; + let compression_codecs = [CompressionCodec::Zstd(None), CompressionCodec::Gzip(None)]; for codec in compression_codecs { let compressed = codec.compress(bytes_vec.clone()).unwrap(); @@ -244,33 +232,44 @@ mod tests { #[test] fn test_suffix() { assert_eq!(CompressionCodec::None.suffix().unwrap(), ""); - assert_eq!(CompressionCodec::gzip_default().suffix().unwrap(), ".gz"); + assert_eq!(CompressionCodec::Gzip(None).suffix().unwrap(), ".gz"); assert!(CompressionCodec::Lz4.suffix().is_err()); - assert!(CompressionCodec::zstd_default().suffix().is_err()); + assert!(CompressionCodec::Zstd(None).suffix().is_err()); assert!(CompressionCodec::Snappy.suffix().is_err()); let lz4_err = CompressionCodec::Lz4.suffix().unwrap_err(); assert!(lz4_err.to_string().contains("suffix not defined for Lz4")); - let zstd_err = CompressionCodec::zstd_default().suffix().unwrap_err(); + let zstd_err = CompressionCodec::Zstd(None).suffix().unwrap_err(); assert!(zstd_err.to_string().contains("suffix not defined for Zstd")); } #[test] - fn test_display() { - assert_eq!(CompressionCodec::None.to_string(), "None"); - assert_eq!(CompressionCodec::Lz4.to_string(), "Lz4"); + fn test_with_level_ignores_level_for_level_less_variants() { + // Snappy and Lz4 don't carry a level; with_level must return them unchanged. assert_eq!( - CompressionCodec::zstd_default().to_string(), - "Zstd(level=3)" + CompressionCodec::Snappy.with_level(Some(5)), + CompressionCodec::Snappy ); - assert_eq!(CompressionCodec::Zstd(5).to_string(), "Zstd(level=5)"); assert_eq!( - CompressionCodec::gzip_default().to_string(), - "Gzip(level=6)" + CompressionCodec::Lz4.with_level(Some(5)), + CompressionCodec::Lz4 ); - assert_eq!(CompressionCodec::Gzip(9).to_string(), "Gzip(level=9)"); + assert_eq!( + CompressionCodec::None.with_level(Some(5)), + CompressionCodec::None + ); + } + + #[test] + fn test_display() { + assert_eq!(CompressionCodec::None.to_string(), "None"); + assert_eq!(CompressionCodec::Lz4.to_string(), "Lz4"); + assert_eq!(CompressionCodec::Zstd(None).to_string(), "Zstd"); + assert_eq!(CompressionCodec::Zstd(Some(5)).to_string(), "Zstd(level=5)"); + assert_eq!(CompressionCodec::Gzip(None).to_string(), "Gzip"); + assert_eq!(CompressionCodec::Gzip(Some(9)).to_string(), "Gzip(level=9)"); assert_eq!(CompressionCodec::Snappy.to_string(), "Snappy"); } } diff --git a/crates/iceberg/src/io/object_cache.rs b/crates/iceberg/src/io/object_cache.rs index d724c6ecda..51800888dd 100644 --- a/crates/iceberg/src/io/object_cache.rs +++ b/crates/iceberg/src/io/object_cache.rs @@ -214,6 +214,7 @@ mod tests { use super::*; use crate::TableIdent; + use crate::compression::CompressionCodec; use crate::io::{FileIO, OutputFile}; use crate::spec::{ DataContentType, DataFileBuilder, DataFileFormat, Literal, ManifestEntry, @@ -296,6 +297,7 @@ mod tests { None, current_schema.clone(), current_partition_spec.as_ref().clone(), + CompressionCodec::None, ) .build_v2_data(); writer @@ -320,20 +322,17 @@ mod tests { let data_file_manifest = writer.write_manifest_file().await.unwrap(); // Write to manifest list - let manifest_list_writer = self - .table - .file_io() - .new_output(current_snapshot.manifest_list()) - .unwrap() - .writer() - .await - .unwrap(); let mut manifest_list_write = ManifestListWriter::v2( - manifest_list_writer, + self.table + .file_io() + .new_output(current_snapshot.manifest_list()) + .unwrap(), current_snapshot.snapshot_id(), current_snapshot.parent_snapshot_id(), current_snapshot.sequence_number(), - ); + CompressionCodec::None, + ) + .unwrap(); manifest_list_write .add_manifests(vec![data_file_manifest].into_iter()) .unwrap(); diff --git a/crates/iceberg/src/puffin/metadata.rs b/crates/iceberg/src/puffin/metadata.rs index 10cfd265da..4c09a07876 100644 --- a/crates/iceberg/src/puffin/metadata.rs +++ b/crates/iceberg/src/puffin/metadata.rs @@ -1024,7 +1024,7 @@ mod tests { assert_eq!(metadata.blobs.len(), 1); assert_eq!( metadata.blobs[0].compression_codec, - CompressionCodec::gzip_default() + CompressionCodec::Gzip(None) ); } } diff --git a/crates/iceberg/src/puffin/mod.rs b/crates/iceberg/src/puffin/mod.rs index 0e054cac51..12f2aef2a1 100644 --- a/crates/iceberg/src/puffin/mod.rs +++ b/crates/iceberg/src/puffin/mod.rs @@ -34,11 +34,7 @@ fn validate_puffin_compression(codec: CompressionCodec) -> Result<()> { other => Err(Error::new( ErrorKind::DataInvalid, format!( - "Compression codec {} is not supported for Puffin files. Only {}, {}, and {} are supported.", - other.name(), - CompressionCodec::None.name(), - CompressionCodec::Lz4.name(), - CompressionCodec::zstd_default().name() + "{other:?} is not supported for Puffin files. Only None, Lz4, and Zstd are supported." ), )), } @@ -65,10 +61,11 @@ mod tests { // Supported codecs assert!(validate_puffin_compression(CompressionCodec::None).is_ok()); assert!(validate_puffin_compression(CompressionCodec::Lz4).is_ok()); - assert!(validate_puffin_compression(CompressionCodec::zstd_default()).is_ok()); - assert!(validate_puffin_compression(CompressionCodec::Zstd(5)).is_ok()); + assert!(validate_puffin_compression(CompressionCodec::Zstd(None)).is_ok()); + assert!(validate_puffin_compression(CompressionCodec::Zstd(Some(3))).is_ok()); // Unsupported codecs - assert!(validate_puffin_compression(CompressionCodec::gzip_default()).is_err()); + assert!(validate_puffin_compression(CompressionCodec::Gzip(None)).is_err()); + assert!(validate_puffin_compression(CompressionCodec::Snappy).is_err()); } } diff --git a/crates/iceberg/src/puffin/reader.rs b/crates/iceberg/src/puffin/reader.rs index 0aced4186f..3e8e3294bf 100644 --- a/crates/iceberg/src/puffin/reader.rs +++ b/crates/iceberg/src/puffin/reader.rs @@ -144,7 +144,7 @@ mod tests { sequence_number: 1, offset: 4, length: 10, - compression_codec: CompressionCodec::gzip_default(), + compression_codec: CompressionCodec::Gzip(None), properties: HashMap::new(), }; @@ -153,7 +153,7 @@ mod tests { assert!(result.is_err()); let err = result.unwrap_err(); assert_eq!(err.kind(), ErrorKind::DataInvalid); - assert!(err.to_string().contains("gzip")); + assert!(err.to_string().to_lowercase().contains("gzip")); assert!( err.to_string() .contains("is not supported for Puffin files") diff --git a/crates/iceberg/src/puffin/test_utils.rs b/crates/iceberg/src/puffin/test_utils.rs index e0844e2002..21e132b2f7 100644 --- a/crates/iceberg/src/puffin/test_utils.rs +++ b/crates/iceberg/src/puffin/test_utils.rs @@ -77,7 +77,7 @@ pub(crate) fn zstd_compressed_metric_blob_0_metadata() -> BlobMetadata { sequence_number: METRIC_BLOB_0_SEQUENCE_NUMBER, offset: 4, length: 22, - compression_codec: CompressionCodec::zstd_default(), + compression_codec: CompressionCodec::Zstd(None), properties: HashMap::new(), } } @@ -134,7 +134,7 @@ pub(crate) fn zstd_compressed_metric_blob_1_metadata() -> BlobMetadata { sequence_number: METRIC_BLOB_1_SEQUENCE_NUMBER, offset: 26, length: 77, - compression_codec: CompressionCodec::zstd_default(), + compression_codec: CompressionCodec::Zstd(None), properties: HashMap::new(), } } diff --git a/crates/iceberg/src/puffin/writer.rs b/crates/iceberg/src/puffin/writer.rs index 4af4970b04..70584b435c 100644 --- a/crates/iceberg/src/puffin/writer.rs +++ b/crates/iceberg/src/puffin/writer.rs @@ -252,7 +252,7 @@ mod tests { let temp_dir = TempDir::new().unwrap(); let blobs = vec![blob_0(), blob_1()]; let blobs_with_compression = - blobs_with_compression(blobs.clone(), CompressionCodec::zstd_default()); + blobs_with_compression(blobs.clone(), CompressionCodec::Zstd(None)); let input_file = write_puffin_file(&temp_dir, blobs_with_compression, file_properties()) .await @@ -324,8 +324,7 @@ mod tests { async fn test_zstd_compressed_metric_data_is_bit_identical_to_java_generated_file() { let temp_dir = TempDir::new().unwrap(); let blobs = vec![blob_0(), blob_1()]; - let blobs_with_compression = - blobs_with_compression(blobs, CompressionCodec::zstd_default()); + let blobs_with_compression = blobs_with_compression(blobs, CompressionCodec::Zstd(None)); assert_files_are_bit_identical( write_puffin_file(&temp_dir, blobs_with_compression, file_properties()) @@ -340,15 +339,14 @@ mod tests { async fn test_gzip_compression_rejected() { let temp_dir = TempDir::new().unwrap(); let blobs = vec![blob_0()]; - let blobs_with_compression = - blobs_with_compression(blobs, CompressionCodec::gzip_default()); + let blobs_with_compression = blobs_with_compression(blobs, CompressionCodec::Gzip(None)); let result = write_puffin_file(&temp_dir, blobs_with_compression, file_properties()).await; assert!(result.is_err()); let err = result.unwrap_err(); assert_eq!(err.kind(), ErrorKind::DataInvalid); - assert!(err.to_string().contains("gzip")); + assert!(err.to_string().to_lowercase().contains("gzip")); assert!( err.to_string() .contains("is not supported for Puffin files") diff --git a/crates/iceberg/src/scan/mod.rs b/crates/iceberg/src/scan/mod.rs index 86092313c6..f41ba6bebe 100644 --- a/crates/iceberg/src/scan/mod.rs +++ b/crates/iceberg/src/scan/mod.rs @@ -619,7 +619,7 @@ pub mod tests { //! shared tests for the table scan API #![allow(missing_docs)] - use std::collections::HashMap; + use std::collections::{HashMap, HashSet}; use std::fs; use std::fs::File; use std::sync::Arc; @@ -639,6 +639,7 @@ pub mod tests { use uuid::Uuid; use crate::arrow::ArrowReaderBuilder; + use crate::compression::CompressionCodec; use crate::expr::{BoundPredicate, Reference}; use crate::io::{FileIO, OutputFile}; use crate::metadata_columns::RESERVED_COL_NAME_FILE; @@ -848,6 +849,7 @@ pub mod tests { None, current_schema.clone(), current_partition_spec.as_ref().clone(), + CompressionCodec::None, ) .build_v2_data(); writer @@ -917,20 +919,17 @@ pub mod tests { let data_file_manifest = writer.write_manifest_file().await.unwrap(); // Write to manifest list - let manifest_list_writer = self - .table - .file_io() - .new_output(current_snapshot.manifest_list()) - .unwrap() - .writer() - .await - .unwrap(); let mut manifest_list_write = ManifestListWriter::v2( - manifest_list_writer, + self.table + .file_io() + .new_output(current_snapshot.manifest_list()) + .unwrap(), current_snapshot.snapshot_id(), current_snapshot.parent_snapshot_id(), current_snapshot.sequence_number(), - ); + CompressionCodec::None, + ) + .unwrap(); manifest_list_write .add_manifests(vec![data_file_manifest].into_iter()) .unwrap(); @@ -1077,6 +1076,7 @@ pub mod tests { None, current_schema.clone(), current_partition_spec.as_ref().clone(), + CompressionCodec::None, ) .build_v2_data(); @@ -1153,20 +1153,17 @@ pub mod tests { let data_file_manifest = writer.write_manifest_file().await.unwrap(); // Write to manifest list - let manifest_list_writer = self - .table - .file_io() - .new_output(current_snapshot.manifest_list()) - .unwrap() - .writer() - .await - .unwrap(); let mut manifest_list_write = ManifestListWriter::v2( - manifest_list_writer, + self.table + .file_io() + .new_output(current_snapshot.manifest_list()) + .unwrap(), current_snapshot.snapshot_id(), current_snapshot.parent_snapshot_id(), current_snapshot.sequence_number(), - ); + CompressionCodec::None, + ) + .unwrap(); manifest_list_write .add_manifests(vec![data_file_manifest].into_iter()) .unwrap(); @@ -1188,6 +1185,7 @@ pub mod tests { None, current_schema.clone(), current_partition_spec.as_ref().clone(), + CompressionCodec::None, ) .build_v2_data(); @@ -1223,6 +1221,7 @@ pub mod tests { None, current_schema.clone(), current_partition_spec.as_ref().clone(), + CompressionCodec::None, ) .build_v2_deletes(); @@ -1249,20 +1248,17 @@ pub mod tests { // Write to manifest list - DATA FIRST then DELETE // This order is crucial for reproduction - let manifest_list_writer = self - .table - .file_io() - .new_output(current_snapshot.manifest_list()) - .unwrap() - .writer() - .await - .unwrap(); let mut manifest_list_write = ManifestListWriter::v2( - manifest_list_writer, + self.table + .file_io() + .new_output(current_snapshot.manifest_list()) + .unwrap(), current_snapshot.snapshot_id(), current_snapshot.parent_snapshot_id(), current_snapshot.sequence_number(), - ); + CompressionCodec::None, + ) + .unwrap(); manifest_list_write .add_manifests(vec![data_manifest, delete_manifest].into_iter()) .unwrap(); @@ -2025,8 +2021,6 @@ pub mod tests { #[tokio::test] async fn test_select_with_file_column() { - use arrow_array::cast::AsArray; - let mut fixture = TableTestFixture::new(); fixture.setup_manifest_files().await; @@ -2148,8 +2142,6 @@ pub mod tests { #[tokio::test] async fn test_file_column_with_multiple_files() { - use std::collections::HashSet; - let mut fixture = TableTestFixture::new(); fixture.setup_manifest_files().await; diff --git a/crates/iceberg/src/spec/avro_util.rs b/crates/iceberg/src/spec/avro_util.rs new file mode 100644 index 0000000000..4d192fd600 --- /dev/null +++ b/crates/iceberg/src/spec/avro_util.rs @@ -0,0 +1,221 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Utilities for working with Apache Avro in Iceberg. + +use apache_avro::{Codec, DeflateSettings, ZstandardSettings}; +use miniz_oxide::deflate::CompressionLevel; +use serde_json::Value; + +use crate::compression::CompressionCodec; +use crate::error::Result; +use crate::{Error, ErrorKind}; + +/// Codec name for uncompressed +pub const CODEC_UNCOMPRESSED: &str = "uncompressed"; + +// These Avro-specific defaults match the Java Iceberg implementation (GzipCodec/ZstandardCodec). +// They differ intentionally from the general-purpose defaults in compression.rs (gzip=6, zstd=3), +// which are used for data-file compression and are not Java-compatible by design. +const DEFAULT_GZIP_LEVEL: u8 = 9; +const DEFAULT_ZSTD_LEVEL: u8 = 1; +/// Max supported level for ZSTD +const MAX_ZSTD_LEVEL: u8 = 22; + +/// Parse codec name and optional level into a [`CompressionCodec`]. +/// +/// Returns `DataInvalid` for unrecognized codec names, matching the Java implementation which +/// throws `IllegalArgumentException("Unsupported compression codec: ...")` in the same case. +/// +/// # Arguments +/// +/// * `codec` - The name of the compression codec (e.g., "gzip", "zstd", "snappy", "uncompressed") +/// * `level` - Optional compression level stored as-is; codec-specific defaults are applied +/// in [`to_avro_codec`] at the point of use. +pub(crate) fn parse_avro_codec(codec: Option<&str>, level: Option) -> Result { + let Some(codec_str) = codec else { + return Ok(CompressionCodec::None); + }; + let lowercase = codec_str.to_lowercase(); + // "uncompressed" is Avro-specific and not in the standard CompressionCodec names + if lowercase == CODEC_UNCOMPRESSED { + return Ok(CompressionCodec::None); + } + let base: CompressionCodec = + serde_json::from_value(Value::String(lowercase)).map_err(|_| { + Error::new( + ErrorKind::DataInvalid, + format!("Unsupported compression codec: {codec_str}"), + ) + })?; + Ok(base.with_level(level)) +} + +/// Convert a [`CompressionCodec`] to an [`apache_avro::Codec`] for use in Avro writers. +/// Codec-specific defaults are applied here (e.g. gzip defaults to level 9, zstd to level 1). +/// Returns an error for codecs that are not supported in Avro (e.g. LZ4). +pub(crate) fn to_avro_codec(codec: CompressionCodec) -> Result { + match codec { + CompressionCodec::None => Ok(Codec::Null), + CompressionCodec::Snappy => Ok(Codec::Snappy), + CompressionCodec::Lz4 => Err(Error::new( + ErrorKind::FeatureUnsupported, + "LZ4 compression is not supported for Avro files", + )), + CompressionCodec::Gzip(level) => { + // miniz_oxide::CompressionLevel only exposes a handful of named levels, so levels + // 2–8 and 11+ all map to DefaultLevel. This is a library granularity limitation, + // not a bug — the output is valid gzip at whatever level miniz chooses. + let compression_level = match level.unwrap_or(DEFAULT_GZIP_LEVEL) { + 0 => CompressionLevel::NoCompression, + 1 => CompressionLevel::BestSpeed, + 9 => CompressionLevel::BestCompression, + 10 => CompressionLevel::UberCompression, + _ => CompressionLevel::DefaultLevel, + }; + Ok(Codec::Deflate(DeflateSettings::new(compression_level))) + } + CompressionCodec::Zstd(level) => { + let zstd_level = level.unwrap_or(DEFAULT_ZSTD_LEVEL).min(MAX_ZSTD_LEVEL); + Ok(Codec::Zstandard(ZstandardSettings::new(zstd_level))) + } + } +} + +#[cfg(test)] +mod tests { + use apache_avro::{DeflateSettings, ZstandardSettings}; + use miniz_oxide::deflate::CompressionLevel; + + use super::*; + + #[test] + fn test_parse_avro_codec_gzip() { + // Test with mixed case to verify case-insensitive matching + let codec = parse_avro_codec(Some("GZip"), Some(5)).unwrap(); + assert_eq!(codec, CompressionCodec::Gzip(Some(5))); + } + + #[test] + fn test_parse_avro_codec_snappy() { + let codec = parse_avro_codec(Some("snappy"), None).unwrap(); + assert_eq!(codec, CompressionCodec::Snappy); + } + + #[test] + fn test_parse_avro_codec_zstd() { + let codec = parse_avro_codec(Some("zstd"), Some(3)).unwrap(); + assert_eq!(codec, CompressionCodec::Zstd(Some(3))); + } + + #[test] + fn test_parse_avro_codec_uncompressed() { + let codec = parse_avro_codec(Some("uncompressed"), None).unwrap(); + assert_eq!(codec, CompressionCodec::None); + } + + #[test] + fn test_parse_avro_codec_null() { + let codec = parse_avro_codec(None, None).unwrap(); + assert_eq!(codec, CompressionCodec::None); + } + + #[test] + fn test_parse_avro_codec_unknown() { + let err = parse_avro_codec(Some("unknown"), Some(1)).unwrap_err(); + assert!( + err.to_string() + .contains("Unsupported compression codec: unknown"), + "unexpected error: {err}" + ); + } + + #[test] + fn test_parse_avro_codec_gzip_no_level() { + // Level is stored as-is (None), default applied in to_avro_codec + let codec = parse_avro_codec(Some("gzip"), None).unwrap(); + assert_eq!(codec, CompressionCodec::Gzip(None)); + } + + #[test] + fn test_parse_avro_codec_zstd_no_level() { + let codec = parse_avro_codec(Some("zstd"), None).unwrap(); + assert_eq!(codec, CompressionCodec::Zstd(None)); + } + + #[test] + fn test_to_avro_codec_gzip_default() { + // None level → default 9 (BestCompression) + let avro_codec = to_avro_codec(CompressionCodec::Gzip(None)).unwrap(); + assert_eq!( + avro_codec, + Codec::Deflate(DeflateSettings::new(CompressionLevel::BestCompression)) + ); + } + + #[test] + fn test_to_avro_codec_gzip_level5() { + let avro_codec = to_avro_codec(CompressionCodec::Gzip(Some(5))).unwrap(); + assert_eq!( + avro_codec, + Codec::Deflate(DeflateSettings::new(CompressionLevel::DefaultLevel)) + ); + } + + #[test] + fn test_to_avro_codec_zstd_default() { + // None level → default 1 + let avro_codec = to_avro_codec(CompressionCodec::Zstd(None)).unwrap(); + assert_eq!(avro_codec, Codec::Zstandard(ZstandardSettings::new(1))); + } + + #[test] + fn test_to_avro_codec_zstd_level3() { + let avro_codec = to_avro_codec(CompressionCodec::Zstd(Some(3))).unwrap(); + assert_eq!(avro_codec, Codec::Zstandard(ZstandardSettings::new(3))); + } + + #[test] + fn test_to_avro_codec_zstd_clamping() { + let avro_codec = to_avro_codec(CompressionCodec::Zstd(Some(MAX_ZSTD_LEVEL + 1))).unwrap(); + assert_eq!( + avro_codec, + Codec::Zstandard(ZstandardSettings::new(MAX_ZSTD_LEVEL)) + ); + } + + #[test] + fn test_to_avro_codec_null() { + assert!(matches!( + to_avro_codec(CompressionCodec::None).unwrap(), + Codec::Null + )); + } + + #[test] + fn test_to_avro_codec_snappy() { + assert!(matches!( + to_avro_codec(CompressionCodec::Snappy).unwrap(), + Codec::Snappy + )); + } + + #[test] + fn test_to_avro_codec_lz4_unsupported() { + assert!(to_avro_codec(CompressionCodec::Lz4).is_err()); + } +} diff --git a/crates/iceberg/src/spec/manifest/mod.rs b/crates/iceberg/src/spec/manifest/mod.rs index fca753ecb2..098e70f7ea 100644 --- a/crates/iceberg/src/spec/manifest/mod.rs +++ b/crates/iceberg/src/spec/manifest/mod.rs @@ -166,6 +166,7 @@ mod tests { use tempfile::TempDir; use super::*; + use crate::compression::CompressionCodec; use crate::io::FileIO; use crate::spec::{Literal, NestedField, PrimitiveType, Struct, Transform, Type}; @@ -273,6 +274,7 @@ mod tests { None, metadata.schema.clone(), metadata.partition_spec.clone(), + CompressionCodec::None, ) .build_v2_data(); for entry in &entries { @@ -575,6 +577,7 @@ mod tests { None, metadata.schema.clone(), metadata.partition_spec.clone(), + CompressionCodec::None, ) .build_v2_data(); for entry in &entries { @@ -672,6 +675,7 @@ mod tests { None, metadata.schema.clone(), metadata.partition_spec.clone(), + CompressionCodec::None, ) .build_v1(); for entry in &entries { @@ -781,6 +785,7 @@ mod tests { None, metadata.schema.clone(), metadata.partition_spec.clone(), + CompressionCodec::None, ) .build_v1(); for entry in &entries { @@ -889,6 +894,7 @@ mod tests { None, metadata.schema.clone(), metadata.partition_spec.clone(), + CompressionCodec::None, ) .build_v2_data(); for entry in &entries { @@ -1168,6 +1174,7 @@ mod tests { None, metadata.schema.clone(), metadata.partition_spec.clone(), + CompressionCodec::None, ) .build_v2_data(); for entry in &entries { diff --git a/crates/iceberg/src/spec/manifest/writer.rs b/crates/iceberg/src/spec/manifest/writer.rs index 88da1a396c..c635f44d88 100644 --- a/crates/iceberg/src/spec/manifest/writer.rs +++ b/crates/iceberg/src/spec/manifest/writer.rs @@ -28,14 +28,14 @@ use super::{ Datum, FormatVersion, ManifestContentType, PartitionSpec, PrimitiveType, UNASSIGNED_SEQUENCE_NUMBER, }; -use crate::encryption::EncryptedOutputFile; +use crate::compression::CompressionCodec; use crate::error::Result; use crate::io::{FileWrite, OutputFile}; use crate::spec::manifest::_serde::{ManifestEntryV1, ManifestEntryV2}; use crate::spec::manifest::{manifest_schema_v1, manifest_schema_v2}; use crate::spec::{ DataContentType, DataFile, FieldSummary, ManifestEntry, ManifestFile, ManifestMetadata, - ManifestStatus, PrimitiveLiteral, SchemaRef, StructType, + ManifestStatus, PrimitiveLiteral, SchemaRef, StructType, avro_util, }; use crate::{Error, ErrorKind}; @@ -43,6 +43,9 @@ use crate::{Error, ErrorKind}; /// with the actual snapshot ID before it is committed. const UNASSIGNED_SNAPSHOT_ID: i64 = -1; +// The future is pinned eagerly (rather than holding OutputFile) because OutputFile is consumed +// by `output.writer().await` — we can't store it and call writer() later. Capturing the future +// at construction time lets us defer the async work while still owning the location string. type WriterFuture = Pin>> + Send>>; /// The builder used to create a [`ManifestWriter`]. @@ -53,6 +56,7 @@ pub struct ManifestWriterBuilder { key_metadata: Option>, schema: SchemaRef, partition_spec: PartitionSpec, + compression: CompressionCodec, } impl ManifestWriterBuilder { @@ -63,6 +67,7 @@ impl ManifestWriterBuilder { key_metadata: Option>, schema: SchemaRef, partition_spec: PartitionSpec, + compression: CompressionCodec, ) -> Self { let location = output.location().to_owned(); Self { @@ -72,27 +77,7 @@ impl ManifestWriterBuilder { key_metadata, schema, partition_spec, - } - } - - /// Create a new builder from an [`EncryptedOutputFile`]. - /// - /// Use this when writing manifests with transparent encryption. - pub fn new_from_encrypted( - encrypted_output: EncryptedOutputFile, - snapshot_id: Option, - key_metadata: Option>, - schema: SchemaRef, - partition_spec: PartitionSpec, - ) -> Self { - let location = encrypted_output.location().to_owned(); - Self { - writer_future: Box::pin(async move { encrypted_output.writer().await }), - location, - snapshot_id, - key_metadata, - schema, - partition_spec, + compression, } } @@ -112,6 +97,7 @@ impl ManifestWriterBuilder { self.key_metadata, metadata, None, + self.compression, ) } @@ -131,6 +117,7 @@ impl ManifestWriterBuilder { self.key_metadata, metadata, None, + self.compression, ) } @@ -150,6 +137,7 @@ impl ManifestWriterBuilder { self.key_metadata, metadata, None, + self.compression, ) } @@ -171,6 +159,7 @@ impl ManifestWriterBuilder { // First row id is assigned by the [`ManifestListWriter`] when the manifest // is added to the list. None, + self.compression, ) } @@ -190,6 +179,7 @@ impl ManifestWriterBuilder { self.key_metadata, metadata, None, + self.compression, ) } } @@ -216,6 +206,8 @@ pub struct ManifestWriter { manifest_entries: Vec, metadata: ManifestMetadata, + + compression: CompressionCodec, } impl ManifestWriter { @@ -227,6 +219,7 @@ impl ManifestWriter { key_metadata: Option>, metadata: ManifestMetadata, first_row_id: Option, + compression: CompressionCodec, ) -> Self { Self { writer_future, @@ -243,6 +236,7 @@ impl ManifestWriter { key_metadata, manifest_entries: Vec::new(), metadata, + compression, } } @@ -451,7 +445,12 @@ impl ManifestWriter { // Manifest schema did not change between V2 and V3 FormatVersion::V2 | FormatVersion::V3 => manifest_schema_v2(&partition_type)?, }; - let mut avro_writer = AvroWriter::new(&avro_schema, Vec::new()); + + let mut avro_writer = AvroWriter::with_codec( + &avro_schema, + Vec::new(), + avro_util::to_avro_codec(self.compression)?, + ); avro_writer.add_user_metadata( "schema".to_string(), to_vec(table_schema).map_err(|err| { @@ -602,8 +601,13 @@ mod tests { use tempfile::TempDir; use super::*; + use crate::compression::CompressionCodec; use crate::io::FileIO; - use crate::spec::{DataFileFormat, Manifest, NestedField, PrimitiveType, Schema, Struct, Type}; + use crate::spec::{ + DataContentType, DataFileBuilder, DataFileFormat, Manifest, ManifestContentType, + ManifestEntry, ManifestMetadata, ManifestStatus, NestedField, PartitionSpec, PrimitiveType, + Schema, Struct, Type, + }; #[tokio::test] async fn test_add_delete_existing() { @@ -735,6 +739,7 @@ mod tests { None, metadata.schema.clone(), metadata.partition_spec.clone(), + CompressionCodec::None, ) .build_v2_data(); writer.add_entry(entries[0].clone()).unwrap(); @@ -755,6 +760,91 @@ mod tests { assert_eq!(actual_manifest, Manifest::new(metadata, entries)); } + #[tokio::test] + async fn test_manifest_writer_with_compression() { + let metadata = { + let schema = Schema::builder() + .with_fields(vec![Arc::new(NestedField::required( + 1, + "id", + Type::Primitive(PrimitiveType::Int), + ))]) + .build() + .unwrap(); + + ManifestMetadata { + schema_id: 0, + schema: Arc::new(schema), + partition_spec: PartitionSpec::unpartition_spec(), + format_version: FormatVersion::V2, + content: ManifestContentType::Data, + } + }; + + async fn write_manifest( + io: &FileIO, + path: &std::path::Path, + metadata: &ManifestMetadata, + compression: CompressionCodec, + ) { + let output_file = io.new_output(path.to_str().unwrap()).unwrap(); + let mut writer = ManifestWriterBuilder::new( + output_file, + Some(1), + None, + metadata.schema.clone(), + metadata.partition_spec.clone(), + compression, + ) + .build_v2_data(); + for i in 0..1000 { + let data_file = DataFileBuilder::default() + .content(DataContentType::Data) + .file_path(format!( + "/very/long/path/to/data/directory/with/many/subdirectories/file_{i}.parquet" + )) + .file_format(DataFileFormat::Parquet) + .partition(Struct::empty()) + .file_size_in_bytes(100000 + i) + .record_count(1000 + i) + .build() + .unwrap(); + let entry = ManifestEntry::builder() + .status(ManifestStatus::Added) + .snapshot_id(1) + .sequence_number(1) + .file_sequence_number(1) + .data_file(data_file) + .build(); + writer.add_entry(entry).unwrap(); + } + writer.write_manifest_file().await.unwrap(); + } + + let tmp_dir = TempDir::new().unwrap(); + let io = FileIO::new_with_fs(); + let uncompressed_path = tmp_dir.path().join("uncompressed_manifest.avro"); + let compressed_path = tmp_dir.path().join("compressed_manifest.avro"); + + write_manifest(&io, &uncompressed_path, &metadata, CompressionCodec::None).await; + write_manifest(&io, &compressed_path, &metadata, CompressionCodec::Gzip(Some(9))).await; + + let uncompressed_size = fs::metadata(&uncompressed_path).unwrap().len(); + let compressed_size = fs::metadata(&compressed_path).unwrap().len(); + + // Verify compression is actually working + assert!( + compressed_size < uncompressed_size, + "Compressed size ({compressed_size}) should be less than uncompressed size ({uncompressed_size})" + ); + + // Verify the compressed file can be read back correctly + let compressed_bytes = fs::read(&compressed_path).unwrap(); + let manifest = Manifest::parse_avro(&compressed_bytes).unwrap(); + assert_eq!(manifest.metadata.format_version, FormatVersion::V2); + assert_eq!(manifest.entries.len(), 1000); + } + #[tokio::test] async fn test_v3_delete_manifest_delete_file_roundtrip() { let schema = Arc::new( @@ -823,6 +913,7 @@ mod tests { None, schema.clone(), partition_spec.clone(), + CompressionCodec::None, ) .build_v3_deletes(); @@ -843,4 +934,5 @@ mod tests { ManifestContentType::Deletes, ); } + } diff --git a/crates/iceberg/src/spec/manifest_list/mod.rs b/crates/iceberg/src/spec/manifest_list/mod.rs index abdbd41fbc..82f2b8bdbc 100644 --- a/crates/iceberg/src/spec/manifest_list/mod.rs +++ b/crates/iceberg/src/spec/manifest_list/mod.rs @@ -99,6 +99,7 @@ mod test { use super::_const_schema::MANIFEST_LIST_AVRO_SCHEMA_V2; use super::_serde::ManifestFileV2; use super::*; + use crate::compression::CompressionCodec; use crate::io::FileIO; use crate::spec::{Datum, FieldSummary, ManifestContentType, ManifestFile}; @@ -134,15 +135,12 @@ mod test { let full_path = format!("{}/{}", tmp_dir.path().to_str().unwrap(), file_name); let mut writer = ManifestListWriter::v1( - file_io - .new_output(full_path.clone()) - .unwrap() - .writer() - .await - .unwrap(), + file_io.new_output(full_path.clone()).unwrap(), 1646658105718557341, Some(1646658105718557341), - ); + CompressionCodec::None, + ) + .unwrap(); writer .add_manifests(manifest_list.entries.clone().into_iter()) @@ -211,16 +209,13 @@ mod test { let full_path = format!("{}/{}", tmp_dir.path().to_str().unwrap(), file_name); let mut writer = ManifestListWriter::v2( - file_io - .new_output(full_path.clone()) - .unwrap() - .writer() - .await - .unwrap(), + file_io.new_output(full_path.clone()).unwrap(), 1646658105718557341, Some(1646658105718557341), 1, - ); + CompressionCodec::None, + ) + .unwrap(); writer .add_manifests(manifest_list.entries.clone().into_iter()) @@ -329,17 +324,14 @@ mod test { let full_path = format!("{}/{}", tmp_dir.path().to_str().unwrap(), file_name); let mut writer = ManifestListWriter::v3( - file_io - .new_output(full_path.clone()) - .unwrap() - .writer() - .await - .unwrap(), + file_io.new_output(full_path.clone()).unwrap(), 377075049360453639, Some(377075049360453639), 1, Some(10), - ); + CompressionCodec::None, + ) + .unwrap(); writer .add_manifests(manifest_list.entries.clone().into_iter()) diff --git a/crates/iceberg/src/spec/manifest_list/writer.rs b/crates/iceberg/src/spec/manifest_list/writer.rs index f739870171..a920a83a75 100644 --- a/crates/iceberg/src/spec/manifest_list/writer.rs +++ b/crates/iceberg/src/spec/manifest_list/writer.rs @@ -25,14 +25,16 @@ use super::_const_schema::{ }; use super::_serde::{ManifestFileV1, ManifestFileV2, ManifestFileV3}; use super::{FormatVersion, ManifestContentType, ManifestFile, UNASSIGNED_SEQUENCE_NUMBER}; +use crate::compression::CompressionCodec; use crate::error::Result; -use crate::io::FileWrite; +use crate::io::OutputFile; +use crate::spec::avro_util::to_avro_codec; use crate::{Error, ErrorKind}; /// A manifest list writer. pub struct ManifestListWriter { format_version: FormatVersion, - writer: Box, + output_file: OutputFile, avro_writer: Writer<'static, Vec>, sequence_number: i64, snapshot_id: i64, @@ -43,6 +45,7 @@ impl std::fmt::Debug for ManifestListWriter { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("ManifestListWriter") .field("format_version", &self.format_version) + .field("output_file", &self.output_file) .field("avro_writer", &self.avro_writer.schema()) .finish_non_exhaustive() } @@ -54,12 +57,13 @@ impl ManifestListWriter { self.next_row_id } - /// Construct a v1 [`ManifestListWriter`] that writes to a provided [`FileWrite`]. + /// Construct a v1 [`ManifestListWriter`] that writes to a provided [`OutputFile`]. pub fn v1( - writer: Box, + output_file: OutputFile, snapshot_id: i64, parent_snapshot_id: Option, - ) -> Self { + compression: CompressionCodec, + ) -> Result { let mut metadata = HashMap::from_iter([ ("snapshot-id".to_string(), snapshot_id.to_string()), ("format-version".to_string(), "1".to_string()), @@ -70,16 +74,25 @@ impl ManifestListWriter { parent_snapshot_id.to_string(), ); } - Self::new(FormatVersion::V1, writer, metadata, 0, snapshot_id, None) + Self::new( + FormatVersion::V1, + output_file, + metadata, + 0, + snapshot_id, + None, + compression, + ) } - /// Construct a v2 [`ManifestListWriter`] that writes to a provided [`FileWrite`]. + /// Construct a v2 [`ManifestListWriter`] that writes to a provided [`OutputFile`]. pub fn v2( - writer: Box, + output_file: OutputFile, snapshot_id: i64, parent_snapshot_id: Option, sequence_number: i64, - ) -> Self { + compression: CompressionCodec, + ) -> Result { let mut metadata = HashMap::from_iter([ ("snapshot-id".to_string(), snapshot_id.to_string()), ("sequence-number".to_string(), sequence_number.to_string()), @@ -93,22 +106,24 @@ impl ManifestListWriter { ); Self::new( FormatVersion::V2, - writer, + output_file, metadata, sequence_number, snapshot_id, None, + compression, ) } - /// Construct a v3 [`ManifestListWriter`] that writes to a provided [`FileWrite`]. + /// Construct a v3 [`ManifestListWriter`] that writes to a provided [`OutputFile`]. pub fn v3( - writer: Box, + output_file: OutputFile, snapshot_id: i64, parent_snapshot_id: Option, sequence_number: i64, first_row_id: Option, // Always None for delete manifests - ) -> Self { + compression: CompressionCodec, + ) -> Result { let mut metadata = HashMap::from_iter([ ("snapshot-id".to_string(), snapshot_id.to_string()), ("sequence-number".to_string(), sequence_number.to_string()), @@ -128,41 +143,44 @@ impl ManifestListWriter { ); Self::new( FormatVersion::V3, - writer, + output_file, metadata, sequence_number, snapshot_id, first_row_id, + compression, ) } fn new( format_version: FormatVersion, - writer: Box, + output_file: OutputFile, metadata: HashMap, sequence_number: i64, snapshot_id: i64, first_row_id: Option, - ) -> Self { + compression: CompressionCodec, + ) -> Result { + let codec = to_avro_codec(compression)?; let avro_schema = match format_version { FormatVersion::V1 => &MANIFEST_LIST_AVRO_SCHEMA_V1, FormatVersion::V2 => &MANIFEST_LIST_AVRO_SCHEMA_V2, FormatVersion::V3 => &MANIFEST_LIST_AVRO_SCHEMA_V3, }; - let mut avro_writer = Writer::new(avro_schema, Vec::new()); + let mut avro_writer = Writer::with_codec(avro_schema, Vec::new(), codec); for (key, value) in metadata { avro_writer .add_user_metadata(key, value) .expect("Avro metadata should be added to the writer before the first record."); } - Self { + Ok(Self { format_version, - writer, + output_file, avro_writer, sequence_number, snapshot_id, next_row_id: first_row_id, - } + }) } /// Append manifests to be written. @@ -197,10 +215,11 @@ impl ManifestListWriter { } /// Write the manifest list to the output file. - pub async fn close(mut self) -> Result<()> { + pub async fn close(self) -> Result<()> { let data = self.avro_writer.into_inner()?; - self.writer.write(Bytes::from(data)).await?; - self.writer.close().await?; + let mut writer = self.output_file.writer().await?; + writer.write(Bytes::from(data)).await?; + writer.close().await?; Ok(()) } @@ -235,7 +254,8 @@ impl ManifestListWriter { Ok(()) } - /// Returns number of newly assigned first-row-ids, if any. + /// Assigns `manifest.first_row_id` if not already set, advancing `self.next_row_id` + /// by the manifest's record count. fn assign_first_row_id(&mut self, manifest: &mut ManifestFile) -> Result<()> { match manifest.content { ManifestContentType::Data => { @@ -319,7 +339,8 @@ mod test { use tempfile::TempDir; use super::ManifestListWriter; - use crate::io::{FileIO, FileWrite}; + use crate::compression::CompressionCodec; + use crate::io::FileIO; use crate::spec::{ Datum, FieldSummary, ManifestContentType, ManifestFile, ManifestList, UNASSIGNED_SEQUENCE_NUMBER, @@ -353,9 +374,15 @@ mod test { let temp_dir = TempDir::new().unwrap(); let path = temp_dir.path().join("manifest_list_v1.avro"); let io = FileIO::new_with_fs(); - let file_writer = file_writer(&path, io).await; + let output_file = output_file(&path, &io); - let mut writer = ManifestListWriter::v1(file_writer, 1646658105718557341, Some(0)); + let mut writer = ManifestListWriter::v1( + output_file, + 1646658105718557341, + Some(0), + CompressionCodec::None, + ) + .unwrap(); writer .add_manifests(expected_manifest_list.entries.clone().into_iter()) .unwrap(); @@ -400,9 +427,16 @@ mod test { let temp_dir = TempDir::new().unwrap(); let path = temp_dir.path().join("manifest_list_v2.avro"); let io = FileIO::new_with_fs(); - let file_writer = file_writer(&path, io).await; + let output_file = output_file(&path, &io); - let mut writer = ManifestListWriter::v2(file_writer, snapshot_id, Some(0), seq_num); + let mut writer = ManifestListWriter::v2( + output_file, + snapshot_id, + Some(0), + seq_num, + CompressionCodec::None, + ) + .unwrap(); writer .add_manifests(expected_manifest_list.entries.clone().into_iter()) .unwrap(); @@ -448,10 +482,17 @@ mod test { let temp_dir = TempDir::new().unwrap(); let path = temp_dir.path().join("manifest_list_v2.avro"); let io = FileIO::new_with_fs(); - let file_writer = file_writer(&path, io).await; + let output_file = output_file(&path, &io); - let mut writer = - ManifestListWriter::v3(file_writer, snapshot_id, Some(0), seq_num, Some(10)); + let mut writer = ManifestListWriter::v3( + output_file, + snapshot_id, + Some(0), + seq_num, + Some(10), + CompressionCodec::None, + ) + .unwrap(); writer .add_manifests(expected_manifest_list.entries.clone().into_iter()) .unwrap(); @@ -496,9 +537,15 @@ mod test { let temp_dir = TempDir::new().unwrap(); let path = temp_dir.path().join("manifest_list_v1.avro"); let io = FileIO::new_with_fs(); - let file_writer = file_writer(&path, io).await; + let output_file = output_file(&path, &io); - let mut writer = ManifestListWriter::v1(file_writer, 1646658105718557341, Some(0)); + let mut writer = ManifestListWriter::v1( + output_file, + 1646658105718557341, + Some(0), + CompressionCodec::None, + ) + .unwrap(); writer .add_manifests(expected_manifest_list.entries.clone().into_iter()) .unwrap(); @@ -541,9 +588,15 @@ mod test { let temp_dir = TempDir::new().unwrap(); let path = temp_dir.path().join("manifest_list_v1.avro"); let io = FileIO::new_with_fs(); - let file_writer = file_writer(&path, io).await; + let output_file = output_file(&path, &io); - let mut writer = ManifestListWriter::v1(file_writer, 1646658105718557341, Some(0)); + let mut writer = ManifestListWriter::v1( + output_file, + 1646658105718557341, + Some(0), + CompressionCodec::None, + ) + .unwrap(); writer .add_manifests(expected_manifest_list.entries.clone().into_iter()) .unwrap(); @@ -588,9 +641,16 @@ mod test { let temp_dir = TempDir::new().unwrap(); let path = temp_dir.path().join("manifest_list_v2.avro"); let io = FileIO::new_with_fs(); - let file_writer = file_writer(&path, io).await; + let output_file = output_file(&path, &io); - let mut writer = ManifestListWriter::v2(file_writer, snapshot_id, Some(0), seq_num); + let mut writer = ManifestListWriter::v2( + output_file, + snapshot_id, + Some(0), + seq_num, + CompressionCodec::None, + ) + .unwrap(); writer .add_manifests(expected_manifest_list.entries.clone().into_iter()) .unwrap(); @@ -607,11 +667,76 @@ mod test { temp_dir.close().unwrap(); } - async fn file_writer(path: &Path, io: FileIO) -> Box { - io.new_output(path.to_str().unwrap()) - .unwrap() - .writer() - .await - .unwrap() + #[tokio::test] + async fn test_manifest_list_writer_with_compression() { + let snapshot_id = 377075049360453639; + let seq_num = 1; + + let entries: Vec = (0..1000) + .map(|i| ManifestFile { + manifest_path: format!( + "s3a://icebergdata/demo/s1/t1/metadata/very-long-path-for-compression-test/manifest-file-number-{i}.avro" + ), + manifest_length: 6926 + i, + partition_spec_id: 1, + content: ManifestContentType::Data, + sequence_number: seq_num, + min_sequence_number: seq_num, + added_snapshot_id: snapshot_id, + added_files_count: Some(1), + existing_files_count: Some(0), + deleted_files_count: Some(0), + added_rows_count: Some(3), + existing_rows_count: Some(0), + deleted_rows_count: Some(0), + partitions: None, + key_metadata: None, + first_row_id: None, + }) + .collect(); + + let tmp_dir = TempDir::new().unwrap(); + let io = FileIO::new_with_fs(); + + let uncompressed_path = tmp_dir.path().join("manifest_list_uncompressed.avro"); + let mut writer = ManifestListWriter::v2( + output_file(&uncompressed_path, &io), + snapshot_id, + Some(0), + seq_num, + CompressionCodec::None, + ) + .unwrap(); + writer.add_manifests(entries.clone().into_iter()).unwrap(); + writer.close().await.unwrap(); + let uncompressed_size = fs::metadata(&uncompressed_path).unwrap().len(); + + let compressed_path = tmp_dir.path().join("manifest_list_compressed.avro"); + let mut writer = ManifestListWriter::v2( + output_file(&compressed_path, &io), + snapshot_id, + Some(0), + seq_num, + CompressionCodec::Gzip(Some(9)), + ) + .unwrap(); + writer.add_manifests(entries.into_iter()).unwrap(); + writer.close().await.unwrap(); + let compressed_size = fs::metadata(&compressed_path).unwrap().len(); + + assert!( + compressed_size < uncompressed_size, + "compressed size ({compressed_size}) should be less than uncompressed size ({uncompressed_size})" + ); + + let compressed_bytes = fs::read(&compressed_path).unwrap(); + let manifest_list = + ManifestList::parse_with_version(&compressed_bytes, crate::spec::FormatVersion::V2) + .unwrap(); + assert_eq!(manifest_list.entries().len(), 1000); + } + + fn output_file(path: &Path, io: &FileIO) -> crate::io::OutputFile { + io.new_output(path.to_str().unwrap()).unwrap() } } diff --git a/crates/iceberg/src/spec/mod.rs b/crates/iceberg/src/spec/mod.rs index b23ca1eda0..0b3f3124e9 100644 --- a/crates/iceberg/src/spec/mod.rs +++ b/crates/iceberg/src/spec/mod.rs @@ -17,6 +17,7 @@ //! Spec for Iceberg. +pub(crate) mod avro_util; mod datatypes; mod encrypted_key; mod manifest; diff --git a/crates/iceberg/src/spec/table_metadata.rs b/crates/iceberg/src/spec/table_metadata.rs index 607fd98350..5c68c28443 100644 --- a/crates/iceberg/src/spec/table_metadata.rs +++ b/crates/iceberg/src/spec/table_metadata.rs @@ -460,7 +460,7 @@ impl TableMetadata { && metadata_content[0] == 0x1F && metadata_content[1] == 0x8B { - let decompressed_data = CompressionCodec::gzip_default() + let decompressed_data = CompressionCodec::Gzip(None) .decompress(metadata_content.to_vec()) .map_err(|e| { Error::new( @@ -3633,7 +3633,7 @@ mod tests { let original_metadata: TableMetadata = get_test_table_metadata("TableMetadataV2Valid.json"); let json = serde_json::to_string(&original_metadata).unwrap(); - let compressed = CompressionCodec::gzip_default() + let compressed = CompressionCodec::Gzip(None) .compress(json.into_bytes()) .expect("failed to compress metadata"); std::fs::write(&metadata_location, &compressed).expect("failed to write metadata"); diff --git a/crates/iceberg/src/spec/table_properties.rs b/crates/iceberg/src/spec/table_properties.rs index 444c9ac4ed..73b47dc4e5 100644 --- a/crates/iceberg/src/spec/table_properties.rs +++ b/crates/iceberg/src/spec/table_properties.rs @@ -21,6 +21,7 @@ use std::str::FromStr; use crate::compression::CompressionCodec; use crate::error::{Error, ErrorKind, Result}; +use crate::spec::avro_util; fn parse_property( properties: &HashMap, @@ -76,9 +77,7 @@ pub(crate) fn parse_metadata_file_compression( Error::new( ErrorKind::DataInvalid, format!( - "Invalid metadata compression codec: {value}. Only '{}' and '{}' are supported.", - CompressionCodec::None.name(), - CompressionCodec::gzip_default().name() + "Invalid metadata compression codec: {value}. Only 'none' and 'gzip' are supported." ), ) })?; @@ -89,9 +88,7 @@ pub(crate) fn parse_metadata_file_compression( _ => Err(Error::new( ErrorKind::DataInvalid, format!( - "Invalid metadata compression codec: {value}. Only '{}' and '{}' are supported for metadata files.", - CompressionCodec::None.name(), - CompressionCodec::gzip_default().name() + "Invalid metadata compression codec: {value}. Only 'none' and 'gzip' are supported for metadata files." ), )), } @@ -112,6 +109,8 @@ pub struct TableProperties { pub write_format_default: String, /// The target file size for files. pub write_target_file_size_bytes: usize, + /// Compression codec for Avro files (manifests, manifest lists) + pub avro_compression_codec: CompressionCodec, /// Compression codec for metadata files (JSON) pub metadata_compression_codec: CompressionCodec, /// Whether to use `FanoutWriter` for partitioned tables. @@ -228,10 +227,19 @@ impl TableProperties { /// Default target file size pub const PROPERTY_WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT: usize = 512 * 1024 * 1024; // 512 MB - /// Compression codec for metadata files (JSON) + /// Compression codec for metadata files (JSON). pub const PROPERTY_METADATA_COMPRESSION_CODEC: &str = "write.metadata.compression-codec"; - /// Default metadata compression codec - uncompressed + /// Default metadata compression codec pub const PROPERTY_METADATA_COMPRESSION_CODEC_DEFAULT: &str = "none"; + + /// Compression codec for Avro files (manifests, manifest lists) + pub const PROPERTY_AVRO_COMPRESSION_CODEC: &str = "write.avro.compression-codec"; + /// Default Avro compression codec - gzip + pub const PROPERTY_AVRO_COMPRESSION_CODEC_DEFAULT: &str = "gzip"; + + /// Compression level for Avro files + pub const PROPERTY_AVRO_COMPRESSION_LEVEL: &str = "write.avro.compression-level"; + /// Whether to use `FanoutWriter` for partitioned tables (handles unsorted data). /// If false, uses `ClusteredWriter` (requires sorted data, more memory efficient). pub const PROPERTY_DATAFUSION_WRITE_FANOUT_ENABLED: &str = "write.datafusion.fanout.enabled"; @@ -324,6 +332,30 @@ impl TryFrom<&HashMap> for TableProperties { TableProperties::PROPERTY_WRITE_TARGET_FILE_SIZE_BYTES, TableProperties::PROPERTY_WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT, )?, + avro_compression_codec: { + let codec_name = parse_property( + props, + TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC, + TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC_DEFAULT.to_string(), + )?; + + let level = props + .get(TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL) + .map(|v| { + v.parse::().map_err(|e| { + Error::new( + ErrorKind::DataInvalid, + format!( + "Invalid value for {}: {e}", + TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL + ), + ) + }) + }) + .transpose()?; + + avro_util::parse_avro_codec(Some(&codec_name), level)? + }, metadata_compression_codec: parse_metadata_file_compression(props)?, write_datafusion_fanout_enabled: parse_property( props, @@ -411,6 +443,11 @@ mod tests { table_properties.write_target_file_size_bytes, TableProperties::PROPERTY_WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT ); + // Test compression defaults - should be gzip with no level specified + assert_eq!( + table_properties.avro_compression_codec, + CompressionCodec::Gzip(None) + ); // Test compression defaults (none means CompressionCodec::None) assert_eq!( table_properties.metadata_compression_codec, @@ -456,6 +493,26 @@ mod tests { assert_eq!(table_properties.max_ref_age_ms, 5678); } + #[test] + fn test_table_properties_avro_compression() { + let props = HashMap::from([ + ( + TableProperties::PROPERTY_AVRO_COMPRESSION_CODEC.to_string(), + "zstd".to_string(), + ), + ( + TableProperties::PROPERTY_AVRO_COMPRESSION_LEVEL.to_string(), + "3".to_string(), + ), + ]); + let table_properties = TableProperties::try_from(&props).unwrap(); + // Check that it parsed to a Zstd codec with level 3 + assert_eq!( + table_properties.avro_compression_codec, + CompressionCodec::Zstd(Some(3)) + ); + } + #[test] fn test_table_properties_compression() { let props = HashMap::from([( @@ -465,7 +522,7 @@ mod tests { let table_properties = TableProperties::try_from(&props).unwrap(); assert_eq!( table_properties.metadata_compression_codec, - CompressionCodec::gzip_default() + CompressionCodec::Gzip(None) ); } @@ -492,7 +549,7 @@ mod tests { let table_properties = TableProperties::try_from(&props_upper).unwrap(); assert_eq!( table_properties.metadata_compression_codec, - CompressionCodec::gzip_default() + CompressionCodec::Gzip(None) ); // Test mixed case @@ -503,7 +560,7 @@ mod tests { let table_properties = TableProperties::try_from(&props_mixed).unwrap(); assert_eq!( table_properties.metadata_compression_codec, - CompressionCodec::gzip_default() + CompressionCodec::Gzip(None) ); // Test "NONE" should also be case-insensitive @@ -658,7 +715,7 @@ mod tests { )]); assert_eq!( parse_metadata_file_compression(&props).unwrap(), - CompressionCodec::gzip_default() + CompressionCodec::Gzip(None) ); // Test case insensitivity - "NONE" @@ -678,7 +735,7 @@ mod tests { )]); assert_eq!( parse_metadata_file_compression(&props).unwrap(), - CompressionCodec::gzip_default() + CompressionCodec::Gzip(None) ); // Test case insensitivity - "GzIp" @@ -688,7 +745,7 @@ mod tests { )]); assert_eq!( parse_metadata_file_compression(&props).unwrap(), - CompressionCodec::gzip_default() + CompressionCodec::Gzip(None) ); // Test default when property is missing diff --git a/crates/iceberg/src/transaction/append.rs b/crates/iceberg/src/transaction/append.rs index 36fde117ab..051d252b92 100644 --- a/crates/iceberg/src/transaction/append.rs +++ b/crates/iceberg/src/transaction/append.rs @@ -158,6 +158,7 @@ mod tests { use tempfile::TempDir; use uuid::Uuid; + use crate::compression::CompressionCodec; use crate::io::FileIO; use crate::spec::{ DataContentType, DataFileBuilder, DataFileFormat, Literal, MAIN_BRANCH, ManifestEntry, @@ -236,6 +237,7 @@ mod tests { None, schema.clone(), partition_spec.as_ref().clone(), + CompressionCodec::None, ) .build_v2_data(); data_writer @@ -266,6 +268,7 @@ mod tests { None, schema.clone(), partition_spec.as_ref().clone(), + CompressionCodec::None, ) .build_v2_data(); delete_writer @@ -301,14 +304,13 @@ mod tests { table .file_io() .new_output(current_snapshot.manifest_list()) - .unwrap() - .writer() - .await .unwrap(), current_snapshot.snapshot_id(), current_snapshot.parent_snapshot_id(), current_snapshot.sequence_number(), - ); + CompressionCodec::None, + ) + .unwrap(); manifest_list_write .add_manifests(vec![data_manifest, delete_manifest].into_iter()) .unwrap(); diff --git a/crates/iceberg/src/transaction/snapshot.rs b/crates/iceberg/src/transaction/snapshot.rs index 8e47226072..b5163600cc 100644 --- a/crates/iceberg/src/transaction/snapshot.rs +++ b/crates/iceberg/src/transaction/snapshot.rs @@ -23,6 +23,7 @@ use futures::TryStreamExt; use futures::stream::FuturesUnordered; use uuid::Uuid; +use crate::compression::CompressionCodec; use crate::error::Result; use crate::spec::{ DataFile, DataFileFormat, FormatVersion, MAIN_BRANCH, ManifestContentType, ManifestEntry, @@ -165,6 +166,18 @@ impl<'a> SnapshotProducer<'a> { Ok(()) } + fn avro_compression_codec(&self) -> Result { + TableProperties::try_from(self.table.metadata().properties()) + .map(|p| p.avro_compression_codec) + .map_err(|e| { + Error::new( + ErrorKind::DataInvalid, + "Failed to parse table properties for compression settings", + ) + .with_source(e) + }) + } + pub(crate) async fn validate_duplicate_files(&self) -> Result<()> { let Some(current_snapshot) = self.table.metadata().current_snapshot() else { return Ok(()); @@ -262,6 +275,7 @@ impl<'a> SnapshotProducer<'a> { .default_partition_spec() .as_ref() .clone(), + self.avro_compression_codec()?, ); match self.table.metadata().format_version() { FormatVersion::V1 => Ok(builder.build_v1()), @@ -440,31 +454,34 @@ impl<'a> SnapshotProducer<'a> { let manifest_list_path = self.generate_manifest_list_file_path(0); let next_seq_num = self.table.metadata().next_sequence_number(); let first_row_id = self.table.metadata().next_row_id(); - let writer = self + let output_file = self .table .file_io() - .new_output(manifest_list_path.clone())? - .writer() - .await?; + .new_output(manifest_list_path.clone())?; + let compression = self.avro_compression_codec()?; + let mut manifest_list_writer = match self.table.metadata().format_version() { FormatVersion::V1 => ManifestListWriter::v1( - writer, + output_file, self.snapshot_id, self.table.metadata().current_snapshot_id(), - ), + compression, + )?, FormatVersion::V2 => ManifestListWriter::v2( - writer, + output_file, self.snapshot_id, self.table.metadata().current_snapshot_id(), next_seq_num, - ), + compression, + )?, FormatVersion::V3 => ManifestListWriter::v3( - writer, + output_file, self.snapshot_id, self.table.metadata().current_snapshot_id(), next_seq_num, Some(first_row_id), - ), + compression, + )?, }; // Calling self.summary() before self.manifest_file() is important because self.added_data_files