From 2f220132f8f93903ec901e51a6b07e36509839f9 Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 2 Feb 2024 11:40:58 +0100 Subject: [PATCH] Clarify state machine terminology --- rc-zip-sync/src/read_zip.rs | 8 ++++---- rc-zip-sync/tests/integration_tests.rs | 8 ++++---- rc-zip-tokio/src/read_zip.rs | 8 ++++---- rc-zip/src/encoding.rs | 10 ++++++++++ rc-zip/src/error.rs | 2 ++ rc-zip/src/format/mod.rs | 7 +++++++ rc-zip/src/{reader => fsm}/archive.rs | 6 +++--- rc-zip/src/fsm/entry.rs | 12 ++++++++++++ rc-zip/src/{reader => fsm}/mod.rs | 12 +++++++++++- rc-zip/src/lib.rs | 21 +-------------------- rc-zip/src/reader/entry.rs | 1 - 11 files changed, 58 insertions(+), 37 deletions(-) rename rc-zip/src/{reader => fsm}/archive.rs (99%) create mode 100644 rc-zip/src/fsm/entry.rs rename rc-zip/src/{reader => fsm}/mod.rs (53%) delete mode 100644 rc-zip/src/reader/entry.rs diff --git a/rc-zip-sync/src/read_zip.rs b/rc-zip-sync/src/read_zip.rs index 5e0522c..fa21a2e 100644 --- a/rc-zip-sync/src/read_zip.rs +++ b/rc-zip-sync/src/read_zip.rs @@ -1,5 +1,5 @@ use rc_zip::{ - reader::{ArchiveReader, ArchiveReaderResult}, + fsm::{ArchiveFsm, FsmResult}, Archive, Error, StoredEntry, }; @@ -41,7 +41,7 @@ where fn read_zip_with_size(&self, size: u64) -> Result, Error> { tracing::trace!(%size, "read_zip_with_size"); - let mut ar = ArchiveReader::new(size); + let mut ar = ArchiveFsm::new(size); loop { if let Some(offset) = ar.wants_read() { tracing::trace!(%offset, "read_zip_with_size: wants_read, space len = {}", ar.space().len()); @@ -58,14 +58,14 @@ where } match ar.process()? 
{ - ArchiveReaderResult::Done(archive) => { + FsmResult::Done(archive) => { tracing::trace!("read_zip_with_size: done"); return Ok(SyncArchive { file: self, archive, }); } - ArchiveReaderResult::Continue => { + FsmResult::Continue => { tracing::trace!("read_zip_with_size: continue"); } } diff --git a/rc-zip-sync/tests/integration_tests.rs b/rc-zip-sync/tests/integration_tests.rs index 8b98bff..4fd962e 100644 --- a/rc-zip-sync/tests/integration_tests.rs +++ b/rc-zip-sync/tests/integration_tests.rs @@ -346,12 +346,12 @@ fn real_world_files() { #[test_log::test] fn state_machine() { - use rc_zip::reader::{ArchiveReader, ArchiveReaderResult}; + use rc_zip::fsm::{ArchiveFsm, FsmResult}; let cases = test_cases(); let case = cases.iter().find(|x| x.name() == "zip64.zip").unwrap(); let bs = case.bytes(); - let mut zar = ArchiveReader::new(bs.len() as u64); + let mut zar = ArchiveFsm::new(bs.len() as u64); let archive = 'read_zip: loop { if let Some(offset) = zar.wants_read() { @@ -381,8 +381,8 @@ fn state_machine() { match zar.process() { Ok(res) => match res { - ArchiveReaderResult::Continue => {} - ArchiveReaderResult::Done(archive) => break 'read_zip archive, + FsmResult::Continue => {} + FsmResult::Done(archive) => break 'read_zip archive, }, Err(err) => { println!("zar processing error: {:#?}", err); diff --git a/rc-zip-tokio/src/read_zip.rs b/rc-zip-tokio/src/read_zip.rs index c16c7cf..438bceb 100644 --- a/rc-zip-tokio/src/read_zip.rs +++ b/rc-zip-tokio/src/read_zip.rs @@ -5,7 +5,7 @@ use positioned_io::{RandomAccessFile, ReadAt}; use tokio::io::{AsyncRead, AsyncReadExt, ReadBuf}; use rc_zip::{ - reader::{ArchiveReader, ArchiveReaderResult}, + fsm::{ArchiveFsm, FsmResult}, Archive, Error, StoredEntry, }; @@ -50,7 +50,7 @@ where type File = F; async fn read_zip_with_size_async(&self, size: u64) -> Result, Error> { - let mut ar = ArchiveReader::new(size); + let mut ar = ArchiveFsm::new(size); loop { if let Some(offset) = ar.wants_read() { match 
self.cursor_at(offset).read(ar.space()).await { @@ -65,13 +65,13 @@ where } match ar.process()? { - ArchiveReaderResult::Done(archive) => { + FsmResult::Done(archive) => { return Ok(AsyncArchive { file: self, archive, }) } - ArchiveReaderResult::Continue => {} + FsmResult::Continue => {} } } } diff --git a/rc-zip/src/encoding.rs b/rc-zip/src/encoding.rs index 823221d..6d126bc 100644 --- a/rc-zip/src/encoding.rs +++ b/rc-zip/src/encoding.rs @@ -1,3 +1,11 @@ +//! zip entry paths may be encoded in a variety of character encodings. +//! +//! Historically, CP-437 was used, but many modern zip files use UTF-8 with an +//! optional UTF-8 flag. +//! +//! Others use the system's local character encoding, and we have no choice but +//! to make an educated guess thanks to the chardet-ng crate. + use std::fmt; /// Encodings supported by this crate @@ -5,12 +13,14 @@ use std::fmt; pub enum Encoding { /// UTF-8 Utf8, + /// [Codepage 437](https://en.wikipedia.org/wiki/Code_page_437), also known as /// OEM-US, PC-8, or DOS Latin US. /// /// This is the fallback if UTF-8 is not specified and no other encoding /// is auto-detected. It was the original encoding of the zip format. Cp437, + /// [Shift JIS](https://en.wikipedia.org/wiki/Shift_JIS), also known as SJIS. /// /// Still in use by some Japanese users as of 2019. diff --git a/rc-zip/src/error.rs b/rc-zip/src/error.rs index 99bdc29..023454e 100644 --- a/rc-zip/src/error.rs +++ b/rc-zip/src/error.rs @@ -1,3 +1,5 @@ +//! All error types used in this crate + use crate::Method; use super::encoding; diff --git a/rc-zip/src/format/mod.rs b/rc-zip/src/format/mod.rs index 709634f..541edc8 100644 --- a/rc-zip/src/format/mod.rs +++ b/rc-zip/src/format/mod.rs @@ -1,3 +1,10 @@ +//! Contain winnow parsers for most elements that make up a ZIP file, like +//! the end-of-central-directory record, local file headers, and central +//! directory headers. +//! +//! Everything in there is based off of the appnote, which you can find in the +//! 
source repository. + pub use crate::encoding::Encoding; mod archive; diff --git a/rc-zip/src/reader/archive.rs b/rc-zip/src/fsm/archive.rs similarity index 99% rename from rc-zip/src/reader/archive.rs rename to rc-zip/src/fsm/archive.rs index 75c98a5..cca0c80 100644 --- a/rc-zip/src/reader/archive.rs +++ b/rc-zip/src/fsm/archive.rs @@ -12,10 +12,10 @@ use winnow::{ Parser, Partial, }; -/// ArchiveReader parses a valid zip archive into an [Archive][]. In particular, this struct finds +/// [ArchiveFsm] parses a valid zip archive into an [Archive]. In particular, this struct finds /// an end of central directory record, parses the entire central directory, detects text encoding, /// and normalizes metadata. -pub struct ArchiveReader { +pub struct ArchiveFsm { // Size of the entire zip file size: u64, state: State, @@ -71,7 +71,7 @@ impl State { } } -impl ArchiveReader { +impl ArchiveFsm { /// This should be > 65KiB, because the section at the end of the /// file that we check for end of central directory record is 65KiB. const DEFAULT_BUFFER_SIZE: usize = 256 * 1024; diff --git a/rc-zip/src/fsm/entry.rs b/rc-zip/src/fsm/entry.rs new file mode 100644 index 0000000..d4c908a --- /dev/null +++ b/rc-zip/src/fsm/entry.rs @@ -0,0 +1,12 @@ +#[derive(Default)] +enum State { + /// Done! + Done, + + #[default] + Transition, +} + +pub struct EntryFsm { + state: State, +} diff --git a/rc-zip/src/reader/mod.rs b/rc-zip/src/fsm/mod.rs similarity index 53% rename from rc-zip/src/reader/mod.rs rename to rc-zip/src/fsm/mod.rs index 92fea0e..33a0404 100644 --- a/rc-zip/src/reader/mod.rs +++ b/rc-zip/src/fsm/mod.rs @@ -1,3 +1,12 @@ +//! Parsers are just part of the puzzle when it comes to zip files: finding the +//! central directory is non-trivial and involves seeking around the input: +//! [ArchiveFsm] provides a state machine to handle this. +//! +//! Similarly, reading an entry involves reading the local header, then the +//! 
data (while calculating the CRC32), then the data descriptor, and then +//! checking whether the uncompressed size and CRC32 match the values in the +//! central directory. + macro_rules! transition { ($state: expr => ($pattern: pat) $body: expr) => { $state = if let $pattern = std::mem::take(&mut $state) { @@ -9,9 +18,10 @@ macro_rules! transition { } mod archive; -pub use archive::ArchiveReader; +pub use archive::ArchiveFsm; mod entry; +pub use entry::EntryFsm; /// Indicates whether or not the state machine has completed its work pub enum FsmResult { diff --git a/rc-zip/src/lib.rs b/rc-zip/src/lib.rs index 25224b6..1408bcf 100644 --- a/rc-zip/src/lib.rs +++ b/rc-zip/src/lib.rs @@ -1,22 +1,3 @@ -//! # rc-zip -//! -//! rc-zip is a zip archive library with a focus on compatibility and correctness. -//! -//! ### Reading -//! -//! [ArchiveReader](reader::ArchiveReader) is your first stop. It -//! ensures we are dealing with a valid zip archive, and reads the central -//! directory. It does not perform I/O itself, but rather, it is a state machine -//! that asks for reads at specific offsets. -//! -//! An [Archive] contains a full list of [entries](StoredEntry), -//! which you can then extract. -//! -//! ### Writing -//! -//! Writing archives is not implemented yet. -//! - mod encoding; mod error; @@ -25,4 +6,4 @@ pub use error::*; mod format; pub use format::*; -pub mod reader; +pub mod fsm; diff --git a/rc-zip/src/reader/entry.rs b/rc-zip/src/reader/entry.rs deleted file mode 100644 index 8b13789..0000000 --- a/rc-zip/src/reader/entry.rs +++ /dev/null @@ -1 +0,0 @@ -