diff --git a/parquet/src/arrow/async_reader/metadata.rs b/parquet/src/arrow/async_reader/metadata.rs index 311c32aeebf2..c0f4a6f62ffa 100644 --- a/parquet/src/arrow/async_reader/metadata.rs +++ b/parquet/src/arrow/async_reader/metadata.rs @@ -113,7 +113,8 @@ impl MetadataLoader { let mut footer = [0; FOOTER_SIZE]; footer.copy_from_slice(&suffix[suffix_len - FOOTER_SIZE..suffix_len]); - let length = ParquetMetaDataReader::decode_footer(&footer)?; + let footer = ParquetMetaDataReader::decode_footer_tail(&footer)?; + let length = footer.metadata_length(); if file_size < length + FOOTER_SIZE { return Err(ParquetError::EOF(format!( @@ -127,13 +128,26 @@ impl MetadataLoader { let (metadata, remainder) = if length > suffix_len - FOOTER_SIZE { let metadata_start = file_size - length - FOOTER_SIZE; let meta = fetch.fetch(metadata_start..file_size - FOOTER_SIZE).await?; - (ParquetMetaDataReader::decode_metadata(&meta, None)?, None) + ( + ParquetMetaDataReader::decode_metadata( + &meta, + footer.encrypted_footer(), + #[cfg(feature = "encryption")] + None, + )?, + None, + ) } else { let metadata_start = file_size - length - FOOTER_SIZE - footer_start; let slice = &suffix[metadata_start..suffix_len - FOOTER_SIZE]; ( - ParquetMetaDataReader::decode_metadata(slice, None)?, + ParquetMetaDataReader::decode_metadata( + slice, + footer.encrypted_footer(), + #[cfg(feature = "encryption")] + None, + )?, Some((footer_start, suffix.slice(..metadata_start))), ) }; diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index de0ecffdf440..a66d55b36f79 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -198,7 +198,8 @@ impl AsyncFileReader for T { let mut buf = [0_u8; FOOTER_SIZE]; self.read_exact(&mut buf).await?; - let metadata_len = ParquetMetaDataReader::decode_footer(&buf)?; + let footer = ParquetMetaDataReader::decode_footer_tail(&buf)?; + let metadata_len = footer.metadata_length(); self.seek(SeekFrom::End(-FOOTER_SIZE_I64 - metadata_len as i64)) .await?; @@ -207,7 +208,10 @@ impl AsyncFileReader for T { // todo: use file_decryption_properties Ok(Arc::new(ParquetMetaDataReader::decode_metadata( - &buf, None, + &buf, + footer.encrypted_footer(), + #[cfg(feature = "encryption")] + None, )?)) } .boxed() @@ -842,6 +846,7 @@ impl RowGroups for InMemoryRowGroup<'_> { self.metadata.column(i), self.row_count, page_locations, + #[cfg(feature = "encryption")] None, )?); diff --git a/parquet/src/file/footer.rs b/parquet/src/file/footer.rs index 21f909d505b2..bdab765cf700 100644 --- a/parquet/src/file/footer.rs +++ b/parquet/src/file/footer.rs @@ -59,14 +59,12 @@ pub fn parse_metadata(chunk_reader: &R) -> Result, -) -> Result { +pub fn decode_metadata(buf: &[u8]) -> Result { ParquetMetaDataReader::decode_metadata( buf, + false, #[cfg(feature = "encryption")] - file_decryption_properties, + None, ) } @@ -81,7 +79,10 @@ pub fn decode_metadata( /// | len | 'PAR1' | /// +-----+--------+ /// ``` -#[deprecated(since = "53.1.0", note = "Use ParquetMetaDataReader::decode_footer")] +#[deprecated( + since = "53.1.0", + note = "Use ParquetMetaDataReader::decode_footer_tail" +)] pub fn decode_footer(slice: &[u8; FOOTER_SIZE]) -> Result { - ParquetMetaDataReader::decode_footer(slice) + ParquetMetaDataReader::decode_footer_tail(slice).map(|f| f.metadata_length()) } diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index e080e50f181b..bec285aecd40 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -78,6 +78,26 @@ pub struct ParquetMetaDataReader { file_decryption_properties: Option, } +/// Describes how the footer metadata is stored +/// +/// This is parsed from the last 8 bytes of the Parquet file +pub struct FooterTail { + metadata_length: usize, + encrypted_footer: bool, +} + +impl FooterTail { + /// The length of the footer metadata in bytes + pub fn metadata_length(&self) -> usize { + self.metadata_length + } + + /// Whether the footer metadata is encrypted + pub fn encrypted_footer(&self) -> bool { + self.encrypted_footer + } +} + impl ParquetMetaDataReader { /// Create a new [`ParquetMetaDataReader`] pub fn new() -> Self { @@ -366,6 +386,7 @@ impl ParquetMetaDataReader { &mut fetch, file_size, self.get_prefetch_size(), + #[cfg(feature = "encryption")] self.file_decryption_properties.as_ref(), ) .await?; @@ -520,7 +541,8 @@ impl ParquetMetaDataReader { .get_read(file_size - 8)? .read_exact(&mut footer)?; - let metadata_len = Self::decode_footer(&footer)?; + let footer = Self::decode_footer_tail(&footer)?; + let metadata_len = footer.metadata_length(); let footer_metadata_len = FOOTER_SIZE + metadata_len; self.metadata_size = Some(footer_metadata_len); @@ -534,6 +556,7 @@ impl ParquetMetaDataReader { let start = file_size - footer_metadata_len as u64; Self::decode_metadata( chunk_reader.get_bytes(start, metadata_len)?.as_ref(), + footer.encrypted_footer(), #[cfg(feature = "encryption")] self.file_decryption_properties.as_ref(), ) @@ -557,7 +580,9 @@ impl ParquetMetaDataReader { fetch: &mut F, file_size: usize, prefetch: usize, - file_decryption_properties: Option<&FileDecryptionProperties>, + #[cfg(feature = "encryption")] file_decryption_properties: Option< + &FileDecryptionProperties, + >, ) -> Result<(ParquetMetaData, Option<(usize, Bytes)>)> { if file_size < FOOTER_SIZE { return Err(eof_err!("file size of {} is less than footer", file_size)); @@ -582,7 +607,8 @@ impl ParquetMetaDataReader { let mut footer = [0; FOOTER_SIZE]; footer.copy_from_slice(&suffix[suffix_len - FOOTER_SIZE..suffix_len]); - let length = Self::decode_footer(&footer)?; + let footer = Self::decode_footer_tail(&footer)?; + let length = footer.metadata_length(); if file_size < length + FOOTER_SIZE { return Err(eof_err!( @@ -597,22 +623,32 @@ impl ParquetMetaDataReader { let metadata_start = file_size - length - FOOTER_SIZE; let meta = fetch.fetch(metadata_start..file_size - FOOTER_SIZE).await?; Ok(( - Self::decode_metadata(&meta, file_decryption_properties)?, + Self::decode_metadata( + &meta, + footer.encrypted_footer(), + #[cfg(feature = "encryption")] + file_decryption_properties, + )?, None, )) } else { let metadata_start = file_size - length - FOOTER_SIZE - footer_start; let slice = &suffix[metadata_start..suffix_len - FOOTER_SIZE]; Ok(( - Self::decode_metadata(slice, file_decryption_properties)?, + Self::decode_metadata( + slice, + footer.encrypted_footer(), + #[cfg(feature = "encryption")] + file_decryption_properties, + )?, Some((footer_start, suffix.slice(..metadata_start))), )) } } - /// Decodes the Parquet footer returning the metadata length in bytes + /// Decodes the end of the Parquet footer /// - /// A parquet footer is 8 bytes long and has the following layout: + /// There are 8 bytes at the end of the Parquet footer with the following layout: /// * 4 bytes for the metadata length /// * 4 bytes for the magic bytes 'PAR1' or 'PARE' (encrypted footer) /// @@ -621,16 +657,28 @@ impl ParquetMetaDataReader { /// | len | 'PAR1' or 'PARE' | /// +-----+------------------+ /// ``` - pub fn decode_footer(slice: &[u8; FOOTER_SIZE]) -> Result { - // check this is indeed a parquet file - if slice[4..] != PARQUET_MAGIC && slice[4..] != PARQUET_MAGIC_ENCR_FOOTER { + pub fn decode_footer_tail(slice: &[u8; FOOTER_SIZE]) -> Result { + let magic = &slice[4..]; + let encrypted_footer = if magic == PARQUET_MAGIC_ENCR_FOOTER { + true + } else if magic == PARQUET_MAGIC { + false + } else { return Err(general_err!("Invalid Parquet file. Corrupt footer")); - } - + }; // get the metadata length from the footer let metadata_len = u32::from_le_bytes(slice[..4].try_into().unwrap()); - // u32 won't be larger than usize in most cases - Ok(metadata_len as usize) + Ok(FooterTail { + // u32 won't be larger than usize in most cases + metadata_length: metadata_len as usize, + encrypted_footer, + }) + } + + /// Decodes the Parquet footer, returning the metadata length in bytes + #[deprecated(note = "use decode_footer_tail instead")] + pub fn decode_footer(slice: &[u8; FOOTER_SIZE]) -> Result { + Self::decode_footer_tail(slice).map(|f| f.metadata_length) } /// Decodes [`ParquetMetaData`] from the provided bytes. @@ -642,21 +690,32 @@ impl ParquetMetaDataReader { /// [Parquet Spec]: https://github.com/apache/parquet-format#metadata pub fn decode_metadata( buf: &[u8], + encrypted_footer: bool, #[cfg(feature = "encryption")] file_decryption_properties: Option< &FileDecryptionProperties, >, ) -> Result { let mut prot = TCompactSliceInputProtocol::new(buf); + #[cfg(not(feature = "encryption"))] + if encrypted_footer { + return Err(general_err!( + "Parquet file has an encrypted footer but the encryption feature is disabled" + )); + } + #[cfg(feature = "encryption")] let mut file_decryptor = None; #[cfg(feature = "encryption")] let decrypted_fmd_buf; #[cfg(feature = "encryption")] - if file_decryption_properties.is_some() - && file_decryption_properties.unwrap().has_footer_key() - { + if encrypted_footer { + if file_decryption_properties.is_none() { + return Err(general_err!("Parquet file has an encrypted footer but no decryption properties were provided")); + }; + let file_decryption_properties = file_decryption_properties.unwrap(); + let t_file_crypto_metadata: TFileCryptoMetaData = TFileCryptoMetaData::read_from_in_protocol(&mut prot) .map_err(|e| general_err!("Could not parse crypto metadata: {}", e))?; @@ -678,7 +737,7 @@ impl ParquetMetaDataReader { let aad_prefix: Vec = aes_gcm_algo.aad_prefix.unwrap_or_default(); file_decryptor = Some(FileDecryptor::new( - file_decryption_properties.unwrap(), + file_decryption_properties, aad_file_unique.clone(), aad_prefix.clone(), ));