From 21788bf3335955dde85ae9341eb8e538abe00292 Mon Sep 17 00:00:00 2001 From: Alon Agmon Date: Wed, 8 May 2024 08:35:00 +0300 Subject: [PATCH 1/4] added serde aliases to support both forms conventions --- crates/iceberg/src/spec/manifest_list.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/crates/iceberg/src/spec/manifest_list.rs b/crates/iceberg/src/spec/manifest_list.rs index da63815d6..676a8dfb0 100644 --- a/crates/iceberg/src/spec/manifest_list.rs +++ b/crates/iceberg/src/spec/manifest_list.rs @@ -68,7 +68,8 @@ impl ManifestList { from_value::<_serde::ManifestListV1>(&values)?.try_into(partition_type_provider) } FormatVersion::V2 => { - let reader = Reader::with_schema(&MANIFEST_LIST_AVRO_SCHEMA_V2, bs)?; + //let reader = Reader::with_schema(&MANIFEST_LIST_AVRO_SCHEMA_V2, bs)?; + let reader = Reader::new(bs)?; let values = Value::Array(reader.collect::, _>>()?); from_value::<_serde::ManifestListV2>(&values)?.try_into(partition_type_provider) } @@ -802,6 +803,9 @@ pub(super) mod _serde { pub key_metadata: Option, } + // Aliases were added to fields that were renemaed in Iceberg 1.5.0 (https://github.com/apache/iceberg/pull/5338), in order to support both conventions/versions. + // In the current implementation deserialization is done using field names, and therefore these fields may appear as either. + // see issue that raised this here: https://github.com/apache/iceberg-rust/issues/338 #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)] pub(super) struct ManifestFileV2 { pub manifest_path: String, @@ -811,8 +815,11 @@ pub(super) mod _serde { pub sequence_number: i64, pub min_sequence_number: i64, pub added_snapshot_id: i64, + #[serde(alias = "added_data_files_count", alias = "added_files_count")] pub added_data_files_count: i32, + #[serde(alias = "existing_data_files_count", alias = "existing_files_count")] pub existing_data_files_count: i32, + #[serde(alias = "deleted_data_files_count", alias = "deleted_files_count")] pub deleted_data_files_count: i32, pub added_rows_count: i64, pub existing_rows_count: i64, From f10709aededcbc91b2c8845077272e6c8ae33c80 Mon Sep 17 00:00:00 2001 From: Alon Agmon Date: Wed, 8 May 2024 08:43:06 +0300 Subject: [PATCH 2/4] reading manifests without avro schema --- crates/iceberg/src/spec/manifest_list.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/crates/iceberg/src/spec/manifest_list.rs b/crates/iceberg/src/spec/manifest_list.rs index 676a8dfb0..57b896de1 100644 --- a/crates/iceberg/src/spec/manifest_list.rs +++ b/crates/iceberg/src/spec/manifest_list.rs @@ -68,7 +68,6 @@ impl ManifestList { from_value::<_serde::ManifestListV1>(&values)?.try_into(partition_type_provider) } FormatVersion::V2 => { - //let reader = Reader::with_schema(&MANIFEST_LIST_AVRO_SCHEMA_V2, bs)?; let reader = Reader::new(bs)?; let values = Value::Array(reader.collect::, _>>()?); from_value::<_serde::ManifestListV2>(&values)?.try_into(partition_type_provider) From a7c51125d5eb9bb0779f2125e55dcbda34d49625 Mon Sep 17 00:00:00 2001 From: Alon Agmon Date: Thu, 9 May 2024 01:10:10 +0300 Subject: [PATCH 3/4] adding avro files of both versions and add a test to deser both --- crates/iceberg/src/spec/manifest_list.rs | 54 ++++++++++++++++-- .../manifests_lists/manifest-list-v2-1.avro | Bin 0 -> 4247 bytes .../manifests_lists/manifest-list-v2-2.avro | Bin 0 -> 4218 bytes 3 files changed, 50 insertions(+), 4 deletions(-) create mode 100644 crates/iceberg/testdata/manifests_lists/manifest-list-v2-1.avro create mode 100644 crates/iceberg/testdata/manifests_lists/manifest-list-v2-2.avro diff --git a/crates/iceberg/src/spec/manifest_list.rs b/crates/iceberg/src/spec/manifest_list.rs index 57b896de1..a4b719eac 100644 --- a/crates/iceberg/src/spec/manifest_list.rs +++ b/crates/iceberg/src/spec/manifest_list.rs @@ -1095,16 +1095,16 @@ pub(super) mod _serde { #[cfg(test)] mod test { + use apache_avro::{Reader, Schema}; use std::{collections::HashMap, fs, sync::Arc}; - use tempfile::TempDir; use crate::{ io::FileIOBuilder, spec::{ - manifest_list::{_serde::ManifestListV1, UNASSIGNED_SEQUENCE_NUMBER}, - FieldSummary, Literal, ManifestContentType, ManifestFile, ManifestList, - ManifestListWriter, NestedField, PrimitiveType, StructType, Type, + manifest_list::_serde::ManifestListV1, FieldSummary, Literal, ManifestContentType, + ManifestFile, ManifestList, ManifestListWriter, NestedField, PrimitiveType, StructType, + Type, UNASSIGNED_SEQUENCE_NUMBER, }, }; @@ -1468,4 +1468,50 @@ mod test { temp_dir.close().unwrap(); } + + #[tokio::test] + async fn test_manifest_list_v2_deserializer_aliases() { + // reading avro manifest file generated by iceberg 1.4.0 + let avro_1_path = "testdata/manifests_lists/manifest-list-v2-1.avro"; + let bs_1 = fs::read(avro_1_path).unwrap(); + let avro_1_fields = read_avro_schema_fields_as_str(bs_1.clone()).await; + assert_eq!( + avro_1_fields, + "manifest_path, manifest_length, partition_spec_id, content, sequence_number, min_sequence_number, added_snapshot_id, added_data_files_count, existing_data_files_count, deleted_data_files_count, added_rows_count, existing_rows_count, deleted_rows_count, partitions" + ); + // reading avro manifest file generated by iceberg 1.5.0 + let avro_2_path = "testdata/manifests_lists/manifest-list-v2-2.avro"; + let bs_2 = fs::read(avro_2_path).unwrap(); + let avro_2_fields = read_avro_schema_fields_as_str(bs_2.clone()).await; + assert_eq!( + avro_2_fields, + "manifest_path, manifest_length, partition_spec_id, content, sequence_number, min_sequence_number, added_snapshot_id, added_files_count, existing_files_count, deleted_files_count, added_rows_count, existing_rows_count, deleted_rows_count, partitions" + ); + // deserializing both files to ManifestList struct + let _manifest_list_1 = + ManifestList::parse_with_version(&bs_1, crate::spec::FormatVersion::V2, move |_id| { + Ok(Some(StructType::new(vec![]))) + }) + .unwrap(); + let _manifest_list_2 = + ManifestList::parse_with_version(&bs_2, crate::spec::FormatVersion::V2, move |_id| { + Ok(Some(StructType::new(vec![]))) + }) + .unwrap(); + } + + async fn read_avro_schema_fields_as_str(bs: Vec) -> String { + let reader = Reader::new(&bs[..]).unwrap(); + let schema = reader.writer_schema(); + let fields: String = match schema { + Schema::Record(record) => record + .fields + .iter() + .map(|field| field.name.clone()) + .collect::>() + .join(", "), + _ => "".to_string(), + }; + fields + } } diff --git a/crates/iceberg/testdata/manifests_lists/manifest-list-v2-1.avro b/crates/iceberg/testdata/manifests_lists/manifest-list-v2-1.avro new file mode 100644 index 0000000000000000000000000000000000000000..5c5cdb1ad853b4723f930058ba3b89e5f108328d GIT binary patch literal 4247 zcmbW4&u<$=6vs(~1gDAv>M1ItRn=1hyH4ylmU=)fNR=vq>QY6Cs%5-8&ThOv%W zr?bCw=`ubD{MSRnA}&6=@<5@(K2cgqKny=Hl~u*VF8l6c&+d|t>RsC*@VILe#|)L$ z?t{XLKBg9|NCSm5u)RI_Zu$nqx$PU6+P;V0yK@T-Y-*vm+bE{27yf{^Q}o?4d0_Aem|{(9PCE&V~G4Nq833}X zxw0q4V9UyCp~-iE*J5T+UWUrJa__P#%nX|qpfal5IW7cDEs{&St9X&)D4;^=CB{L3 zhwwApeHWyUKcCdfpsqHvBc78;CJWg1Lb$Uma2B`Y`wqb#JV-S1ZV-WZbrJT4iJDpL zj>27X9K`cP2iS=wHE2sZivkvh&hJUdnD=Kvu~))0Nf$>BRaz{kylyn56dX{4K<~ia z61tjIRY|0o^WV*s7{za)_$}1+1B7AQOw?Itrhw&n5%v2Q0LO1F4%QCuAFe8J+guQd zbjCCf`NrCj>tfIjwiSX6OO$i!e$-6}`)WL#4c{cjCD^)yDY=|!Bap5&8ufK;V|{b0 zR^O^=@M$zxfa7Aic0hs<=Gt=YN+!l@$qZQ5R&0a7$kC_KkOxOWTlNN*`{D?2$0@(RUC|Z%i=YIt!CsvhyKp<)y!7rBN1S%l% zFfVro9H4D-fbax!gk8iV2w0uNvYrcfPyx*qti6C+5Yz^zmivTx$aDGz4prAVw9GNg zgKW>?AOvq?f>$!&E1>deDS8P(5Y6M0^I{M_PG!#`IHFzIB)u3W8?U7lOsR5x zF`FW}h?kj(d*L_t-jC=1z47s#f0zFJ?D=SA`PZeBpB^2Z9*=j%KkS}ukB`65cTUgs zhsXNEFVDX@*%>_=ZI8b9J$p3njE~QI@9&SF^g5lRC%q%C>O-cYpu#>E|!``XA4pzxdB5uN27t12gigOaK4? literal 0 HcmV?d00001 diff --git a/crates/iceberg/testdata/manifests_lists/manifest-list-v2-2.avro b/crates/iceberg/testdata/manifests_lists/manifest-list-v2-2.avro new file mode 100644 index 0000000000000000000000000000000000000000..00784ff1d6ab0c6ef8d11f71075a3f997440f1f9 GIT binary patch literal 4218 zcmbVPO^X~w7&hxAhme!(K|(AV$!S-*XLd5P0|CV-1Okc^W!WY6)^t~Ix7#0+?&=-K zups_|%{XAPyO=$Qm_>3D^tgiH!Hf6@ybFSO_TXFnQB^(NJs&wvPxbS>^?p3>)9>{^ zxpZ|EAB4dhkzo@Le}4U;LPrClbd``8L1-##ijO_^+rz$Nk%;P+;}UpmIm9(1rMvU6 zbYg(14JR_7h=z{82fxk00GzvlfvFSt=)({0prJ!;^llF&>>PF7F_mshRS(z7*e>z+ zWOO%>u6g5|@-B*V+|P%H;C@XUY~>_?m|j>*(sEsLPMMQ1yVG`5yGZP*v*rordiTj5|`&deiq`p8w{aI zP<}Y4g-XabouaxaQ6-`1k&g$F9neBSiWkdHu&PiS)ASBASkz{Tg>_1#8$n!Gskc~O z5EL>Tq5PPXBXvW9B%eAFg-%ZfWG3o6*~uBea`MwwFz4gmPI_imyyZlP!LSr-ABc6k4AO@V3iKpL{8(mCW8 z>&2djK{41>2sUhy&YAa7+W`AYvW<F9I>U^H$cu&Dh|49{V;4*$9GluL z{DI^_xN=KhVyTL5^3D~GAhjQYy`a^(HJIlNs_cl!Xs3cUF(QJsaA)pF=FwId62S_~ zD*Hk$dXHLC$A{g_7IE@9x`+u8WMV%R{l?N{^~R%MEd(yB7kG{|=otnBUVj}Lj`}1mS&-cft*7y5QO!N7^eQH}rmW`cV_u0w**;iX1 fP5ylJ^Tow4Km7gA`M1ADt52_xm4Dy7T%!IT?a!Mc literal 0 HcmV?d00001 From deecddd821d1bec7efcea7efa01525532467284f Mon Sep 17 00:00:00 2001 From: Alon Agmon Date: Thu, 9 May 2024 08:48:15 +0300 Subject: [PATCH 4/4] fixed typo --- crates/iceberg/src/spec/manifest_list.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/iceberg/src/spec/manifest_list.rs b/crates/iceberg/src/spec/manifest_list.rs index a4b719eac..c390bee04 100644 --- a/crates/iceberg/src/spec/manifest_list.rs +++ b/crates/iceberg/src/spec/manifest_list.rs @@ -802,7 +802,7 @@ pub(super) mod _serde { pub key_metadata: Option, } - // Aliases were added to fields that were renemaed in Iceberg 1.5.0 (https://github.com/apache/iceberg/pull/5338), in order to support both conventions/versions. + // Aliases were added to fields that were renamed in Iceberg 1.5.0 (https://github.com/apache/iceberg/pull/5338), in order to support both conventions/versions. // In the current implementation deserialization is done using field names, and therefore these fields may appear as either. // see issue that raised this here: https://github.com/apache/iceberg-rust/issues/338 #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]