Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Read larger archives properly (many entries)
Browse files Browse the repository at this point in the history
fasterthanlime committed Feb 5, 2024
1 parent 9fb6d4e commit 9795d55
Showing 1 changed file with 78 additions and 68 deletions.
146 changes: 78 additions & 68 deletions rc-zip/src/fsm/archive.rs
Original file line number Diff line number Diff line change
@@ -251,6 +251,7 @@ impl ArchiveFsm {
"ReadCentralDirectory | process(), available: {}",
self.buffer.available_data()
);
let mut valid_consumed = 0;
let mut input = Partial::new(self.buffer.data());
trace!(
initial_offset = input.as_bytes().offset_from(&self.buffer.data()),
@@ -266,14 +267,16 @@ impl ArchiveFsm {
len = input.len(),
"ReadCentralDirectory | parsed directory header"
);
valid_consumed =
input.as_bytes().offset_from(&self.buffer.data()) as usize;
directory_headers.push(dh.into_owned());
}
Err(ErrMode::Incomplete(_needed)) => {
// need more data to read the full header
trace!("ReadCentralDirectory | incomplete!");
break 'read_headers;
}
Err(ErrMode::Backtrack(_err)) | Err(ErrMode::Cut(_err)) => {
Err(ErrMode::Backtrack(err)) | Err(ErrMode::Cut(err)) => {
// this is the normal end condition when reading
// the central directory (due to 65536-entries non-zip64 files)
// let's just check a few numbers first.
@@ -282,88 +285,95 @@ impl ArchiveFsm {
let expected_records = directory_headers.len() as u16;
let actual_records = eocd.directory_records() as u16;

if expected_records == actual_records {
let mut detectorng = chardetng::EncodingDetector::new();
let mut all_utf8 = true;
let mut had_suspicious_chars_for_cp437 = false;
if expected_records != actual_records {
tracing::trace!(
"error while reading central records: we read {} records, but EOCD announced {}. the last failed with: {err:?} (display: {err}). at that point, input had length {}",
expected_records,
actual_records,
input.len()
);

{
let max_feed: usize = 4096;
let mut total_fed: usize = 0;
let mut feed = |slice: &[u8]| {
detectorng.feed(slice, false);
for b in slice {
if (0xB0..=0xDF).contains(b) {
// those are, like, box drawing characters
had_suspicious_chars_for_cp437 = true;
}
// if we read the wrong number of directory entries,
// error out.
return Err(FormatError::InvalidCentralRecord {
expected: expected_records,
actual: actual_records,
}
.into());
}

let mut detectorng = chardetng::EncodingDetector::new();
let mut all_utf8 = true;
let mut had_suspicious_chars_for_cp437 = false;

{
let max_feed: usize = 4096;
let mut total_fed: usize = 0;
let mut feed = |slice: &[u8]| {
detectorng.feed(slice, false);
for b in slice {
if (0xB0..=0xDF).contains(b) {
// those are, like, box drawing characters
had_suspicious_chars_for_cp437 = true;
}
}

total_fed += slice.len();
total_fed < max_feed
};
total_fed += slice.len();
total_fed < max_feed
};

'recognize_encoding: for fh in
directory_headers.iter().filter(|fh| fh.is_non_utf8())
{
all_utf8 = false;
if !feed(&fh.name[..]) || !feed(&fh.comment[..]) {
break 'recognize_encoding;
}
'recognize_encoding: for fh in
directory_headers.iter().filter(|fh| fh.is_non_utf8())
{
all_utf8 = false;
if !feed(&fh.name[..]) || !feed(&fh.comment[..]) {
break 'recognize_encoding;
}
}
}

let encoding = {
if all_utf8 {
Encoding::Utf8
} else {
let encoding = detectorng.guess(None, true);
if encoding == encoding_rs::SHIFT_JIS {
// well hold on, sometimes Codepage 437 is detected as
// Shift-JIS by chardetng. If we have any characters
// that aren't valid DOS file names, then okay it's probably
// Shift-JIS. Otherwise, assume it's CP437.
if had_suspicious_chars_for_cp437 {
Encoding::ShiftJis
} else {
Encoding::Cp437
}
} else if encoding == encoding_rs::UTF_8 {
Encoding::Utf8
let encoding = {
if all_utf8 {
Encoding::Utf8
} else {
let encoding = detectorng.guess(None, true);
if encoding == encoding_rs::SHIFT_JIS {
// well hold on, sometimes Codepage 437 is detected as
// Shift-JIS by chardetng. If we have any characters
// that aren't valid DOS file names, then okay it's probably
// Shift-JIS. Otherwise, assume it's CP437.
if had_suspicious_chars_for_cp437 {
Encoding::ShiftJis
} else {
Encoding::Cp437
}
} else if encoding == encoding_rs::UTF_8 {
Encoding::Utf8
} else {
Encoding::Cp437
}
};

let global_offset = eocd.global_offset as u64;
let entries: Result<Vec<Entry>, Error> = directory_headers
.iter()
.map(|x| x.as_entry(encoding, global_offset))
.collect();
let entries = entries?;

let comment = encoding.decode(eocd.comment())?;

return Ok(FsmResult::Done(Archive {
size: self.size,
comment,
entries,
encoding,
}));
} else {
// if we read the wrong number of directory entries,
// error out.
return Err(FormatError::InvalidCentralRecord {
expected: expected_records,
actual: actual_records,
}
.into());
}
};

let global_offset = eocd.global_offset as u64;
let entries: Result<Vec<Entry>, Error> = directory_headers
.iter()
.map(|x| x.as_entry(encoding, global_offset))
.collect();
let entries = entries?;

let comment = encoding.decode(eocd.comment())?;

return Ok(FsmResult::Done(Archive {
size: self.size,
comment,
entries,
encoding,
}));
}
}
}
let consumed = input.as_bytes().offset_from(&self.buffer.data());
let consumed = valid_consumed;
tracing::trace!(%consumed, "ReadCentralDirectory total consumed");
self.buffer.consume(consumed);

0 comments on commit 9795d55

Please sign in to comment.