diff --git a/rc-zip/src/fsm/archive.rs b/rc-zip/src/fsm/archive.rs index 1118987..656af7d 100644 --- a/rc-zip/src/fsm/archive.rs +++ b/rc-zip/src/fsm/archive.rs @@ -251,6 +251,7 @@ impl ArchiveFsm { "ReadCentralDirectory | process(), available: {}", self.buffer.available_data() ); + let mut valid_consumed = 0; let mut input = Partial::new(self.buffer.data()); trace!( initial_offset = input.as_bytes().offset_from(&self.buffer.data()), @@ -266,6 +267,8 @@ impl ArchiveFsm { len = input.len(), "ReadCentralDirectory | parsed directory header" ); + valid_consumed = + input.as_bytes().offset_from(&self.buffer.data()) as usize; directory_headers.push(dh.into_owned()); } Err(ErrMode::Incomplete(_needed)) => { @@ -273,7 +276,7 @@ impl ArchiveFsm { trace!("ReadCentralDirectory | incomplete!"); break 'read_headers; } - Err(ErrMode::Backtrack(_err)) | Err(ErrMode::Cut(_err)) => { + Err(ErrMode::Backtrack(err)) | Err(ErrMode::Cut(err)) => { // this is the normal end condition when reading // the central directory (due to 65536-entries non-zip64 files) // let's just check a few numbers first. @@ -282,88 +285,95 @@ impl ArchiveFsm { let expected_records = directory_headers.len() as u16; let actual_records = eocd.directory_records() as u16; - if expected_records == actual_records { - let mut detectorng = chardetng::EncodingDetector::new(); - let mut all_utf8 = true; - let mut had_suspicious_chars_for_cp437 = false; + if expected_records != actual_records { + tracing::trace!( + "error while reading central records: we read {} records, but EOCD announced {}. the last failed with: {err:?} (display: {err}). at that point, input had length {}", + expected_records, + actual_records, + input.len() + ); - { - let max_feed: usize = 4096; - let mut total_fed: usize = 0; - let mut feed = |slice: &[u8]| { - detectorng.feed(slice, false); - for b in slice { - if (0xB0..=0xDF).contains(b) { - // those are, like, box drawing characters - had_suspicious_chars_for_cp437 = true; - } + // if we read the wrong number of directory entries, + // error out. + return Err(FormatError::InvalidCentralRecord { + expected: expected_records, + actual: actual_records, + } + .into()); + } + + let mut detectorng = chardetng::EncodingDetector::new(); + let mut all_utf8 = true; + let mut had_suspicious_chars_for_cp437 = false; + + { + let max_feed: usize = 4096; + let mut total_fed: usize = 0; + let mut feed = |slice: &[u8]| { + detectorng.feed(slice, false); + for b in slice { + if (0xB0..=0xDF).contains(b) { + // those are, like, box drawing characters + had_suspicious_chars_for_cp437 = true; } + } - total_fed += slice.len(); - total_fed < max_feed - }; + total_fed += slice.len(); + total_fed < max_feed + }; - 'recognize_encoding: for fh in - directory_headers.iter().filter(|fh| fh.is_non_utf8()) - { - all_utf8 = false; - if !feed(&fh.name[..]) || !feed(&fh.comment[..]) { - break 'recognize_encoding; - } + 'recognize_encoding: for fh in + directory_headers.iter().filter(|fh| fh.is_non_utf8()) + { + all_utf8 = false; + if !feed(&fh.name[..]) || !feed(&fh.comment[..]) { + break 'recognize_encoding; } } + } - let encoding = { - if all_utf8 { - Encoding::Utf8 - } else { - let encoding = detectorng.guess(None, true); - if encoding == encoding_rs::SHIFT_JIS { - // well hold on, sometimes Codepage 437 is detected as - // Shift-JIS by chardetng. If we have any characters - // that aren't valid DOS file names, then okay it's probably - // Shift-JIS. Otherwise, assume it's CP437. - if had_suspicious_chars_for_cp437 { - Encoding::ShiftJis - } else { - Encoding::Cp437 - } - } else if encoding == encoding_rs::UTF_8 { - Encoding::Utf8 + let encoding = { + if all_utf8 { + Encoding::Utf8 + } else { + let encoding = detectorng.guess(None, true); + if encoding == encoding_rs::SHIFT_JIS { + // well hold on, sometimes Codepage 437 is detected as + // Shift-JIS by chardetng. If we have any characters + // that aren't valid DOS file names, then okay it's probably + // Shift-JIS. Otherwise, assume it's CP437. + if had_suspicious_chars_for_cp437 { + Encoding::ShiftJis } else { Encoding::Cp437 } + } else if encoding == encoding_rs::UTF_8 { + Encoding::Utf8 + } else { + Encoding::Cp437 } - }; - - let global_offset = eocd.global_offset as u64; - let entries: Result, Error> = directory_headers - .iter() - .map(|x| x.as_entry(encoding, global_offset)) - .collect(); - let entries = entries?; - - let comment = encoding.decode(eocd.comment())?; - - return Ok(FsmResult::Done(Archive { - size: self.size, - comment, - entries, - encoding, - })); - } else { - // if we read the wrong number of directory entries, - // error out. - return Err(FormatError::InvalidCentralRecord { - expected: expected_records, - actual: actual_records, } - .into()); - } + }; + + let global_offset = eocd.global_offset as u64; + let entries: Result, Error> = directory_headers + .iter() + .map(|x| x.as_entry(encoding, global_offset)) + .collect(); + let entries = entries?; + + let comment = encoding.decode(eocd.comment())?; + + return Ok(FsmResult::Done(Archive { + size: self.size, + comment, + entries, + encoding, + })); } } } - let consumed = input.as_bytes().offset_from(&self.buffer.data()); + let consumed = valid_consumed; tracing::trace!(%consumed, "ReadCentralDirectory total consumed"); self.buffer.consume(consumed);