-
Notifications
You must be signed in to change notification settings - Fork 833
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Convert some panics that happen on invalid parquet files to error results #6738
base: main
Are you sure you want to change the base?
Changes from 1 commit
f481dff
a4f8286
a88bc81
3c97bbc
a7db494
78994df
55f3f64
2597c9a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -617,7 +617,8 @@ impl ParquetMetaDataReader { | |
for rg in t_file_metadata.row_groups { | ||
row_groups.push(RowGroupMetaData::from_thrift(schema_descr.clone(), rg)?); | ||
} | ||
let column_orders = Self::parse_column_orders(t_file_metadata.column_orders, &schema_descr); | ||
let column_orders = | ||
Self::parse_column_orders(t_file_metadata.column_orders, &schema_descr)?; | ||
Comment on lines
+630
to
+631
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I realize this would currently panic, but would one ever prefer to just set There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. good point! i agree with the |
||
|
||
let file_metadata = FileMetaData::new( | ||
t_file_metadata.version, | ||
|
@@ -635,15 +636,13 @@ impl ParquetMetaDataReader { | |
fn parse_column_orders( | ||
t_column_orders: Option<Vec<TColumnOrder>>, | ||
schema_descr: &SchemaDescriptor, | ||
) -> Option<Vec<ColumnOrder>> { | ||
) -> Result<Option<Vec<ColumnOrder>>> { | ||
match t_column_orders { | ||
Some(orders) => { | ||
// Should always be the case | ||
assert_eq!( | ||
orders.len(), | ||
schema_descr.num_columns(), | ||
"Column order length mismatch" | ||
); | ||
if orders.len() != schema_descr.num_columns() { | ||
return Err(general_err!("Column order length mismatch")); | ||
}; | ||
let mut res = Vec::new(); | ||
for (i, column) in schema_descr.columns().iter().enumerate() { | ||
match orders[i] { | ||
|
@@ -657,9 +656,9 @@ impl ParquetMetaDataReader { | |
} | ||
} | ||
} | ||
Some(res) | ||
Ok(Some(res)) | ||
} | ||
None => None, | ||
None => Ok(None), | ||
} | ||
} | ||
} | ||
|
@@ -731,7 +730,7 @@ mod tests { | |
]); | ||
|
||
assert_eq!( | ||
ParquetMetaDataReader::parse_column_orders(t_column_orders, &schema_descr), | ||
ParquetMetaDataReader::parse_column_orders(t_column_orders, &schema_descr).unwrap(), | ||
Some(vec![ | ||
ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED), | ||
ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED) | ||
|
@@ -740,20 +739,21 @@ mod tests { | |
|
||
// Test when no column orders are defined. | ||
assert_eq!( | ||
ParquetMetaDataReader::parse_column_orders(None, &schema_descr), | ||
ParquetMetaDataReader::parse_column_orders(None, &schema_descr).unwrap(), | ||
None | ||
); | ||
} | ||
|
||
#[test] | ||
#[should_panic(expected = "Column order length mismatch")] | ||
fn test_metadata_column_orders_len_mismatch() { | ||
let schema = SchemaType::group_type_builder("schema").build().unwrap(); | ||
let schema_descr = SchemaDescriptor::new(Arc::new(schema)); | ||
|
||
let t_column_orders = Some(vec![TColumnOrder::TYPEORDER(TypeDefinedOrder::new())]); | ||
|
||
ParquetMetaDataReader::parse_column_orders(t_column_orders, &schema_descr); | ||
let res = ParquetMetaDataReader::parse_column_orders(t_column_orders, &schema_descr); | ||
assert!(res.is_err()); | ||
assert!(format!("{:?}", res.unwrap_err()).contains("Column order length mismatch")); | ||
} | ||
|
||
#[test] | ||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -157,6 +157,32 @@ pub fn from_thrift( | |||||
stats.max_value | ||||||
}; | ||||||
|
||||||
fn check_len(min: &Option<Vec<u8>>, max: &Option<Vec<u8>>, len: usize) -> Result<()> { | ||||||
if let Some(min) = min { | ||||||
if min.len() < len { | ||||||
return Err(ParquetError::General( | ||||||
"Insufficient bytes to parse min statistic".to_string(), |
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. thanks for catching this. |
||||||
)); | ||||||
} | ||||||
} | ||||||
if let Some(max) = max { | ||||||
if max.len() < len { | ||||||
return Err(ParquetError::General( | ||||||
"Insufficient bytes to parse max statistic".to_string(), | ||||||
)); | ||||||
} | ||||||
} | ||||||
Ok(()) | ||||||
} | ||||||
|
||||||
match physical_type { | ||||||
Type::BOOLEAN => check_len(&min, &max, 1), | ||||||
Type::INT32 | Type::FLOAT => check_len(&min, &max, 4), | ||||||
Type::INT64 | Type::DOUBLE => check_len(&min, &max, 8), | ||||||
Type::INT96 => check_len(&min, &max, 12), | ||||||
_ => Ok(()), | ||||||
}?; | ||||||
|
||||||
// Values are encoded using PLAIN encoding definition, except that | ||||||
// variable-length byte arrays do not include a length prefix. | ||||||
// | ||||||
|
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1122,6 +1122,10 @@ pub fn from_thrift(elements: &[SchemaElement]) -> Result<TypePtr> { | |
)); | ||
} | ||
|
||
if !schema_nodes[0].is_group() { | ||
return Err(general_err!("Expected root node to be a group type")); | ||
} | ||
|
||
Ok(schema_nodes.remove(0)) | ||
} | ||
|
||
|
@@ -1227,6 +1231,10 @@ fn from_thrift_helper(elements: &[SchemaElement], index: usize) -> Result<(usize | |
if !is_root_node { | ||
builder = builder.with_repetition(rep); | ||
} | ||
} else if !is_root_node { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need this check? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is based on the comment at line 1230 which says There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My concern with this check is unless it is necessary for correctness, there is potential adding it breaks something for someone. Parquet is a very broad ecosystem, lots of writers have interesting interpretations of the specification There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
if it happened then someone must have already experienced panics. But i get your point, removed this change. thx |
||
return Err(general_err!( | ||
"Repetition level must be defined for non-root types" | ||
)); | ||
} | ||
Ok((next_index, Arc::new(builder.build().unwrap()))) | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is a breaking API change
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah — is it fine given that it just wraps the return value within Result? The behavior change is just "panic --> error".
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No this is a breaking change
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Removed this change. By the way, I saw that the planned 54.0.0 release will have a "major API change" — how is an API change considered an acceptable breaking change?