diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 44ea5b7..d3176aa 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -34,7 +34,7 @@ jobs: cargo doc --all-features --no-deps - name: Run cargo clippy run: | - cargo hack clippy --feature-powerset --group-features deflate,deflate64,lzma,bzip2 + cargo hack clippy --each-feature - name: Run tests and collect coverage run: just ci-test - name: Upload coverage information diff --git a/.vscode/settings.json b/.vscode/settings.json index 3248b76..a36c645 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,3 +1,13 @@ { - "rust-analyzer.cargo.features": ["default", "lzma", "deflate64", "bzip2", "zstd"] + "rust-analyzer.cargo.features": [ + "rc-zip/corpus", + "deflate", + "deflate64", + "bzip2", + "lzma", + "zstd", + ], + "rust-analyzer.linkedProjects": [ + "./Cargo.toml" + ] } \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 286830c..eaa3efb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,15 @@ # It is not intended for manual editing. 
version = 3 +[[package]] +name = "addr2line" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" +dependencies = [ + "gimli", +] + [[package]] name = "adler" version = "1.0.2" @@ -86,6 +95,21 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +[[package]] +name = "backtrace" +version = "0.3.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" +dependencies = [ + "addr2line", + "cc", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", +] + [[package]] name = "bumpalo" version = "3.14.0" @@ -98,6 +122,12 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" +[[package]] +name = "bytes" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" + [[package]] name = "bzip2" version = "0.4.4" @@ -277,15 +307,100 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] -name = "flate2" -version = "1.0.28" +name = "futures" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46303f565772937ffe1d394a4fac6f411c6013172fadde9dcdb1e147a086940e" +checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0" dependencies = [ - "crc32fast", - "miniz_oxide", + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.30" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" + +[[package]] +name = "futures-executor" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", ] +[[package]] +name = "futures-io" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" + +[[package]] +name = "futures-macro" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "futures-sink" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" + +[[package]] +name = "futures-task" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" + +[[package]] +name = "futures-util" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] + +[[package]] +name = "gimli" +version = "0.28.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" + [[package]] name = "hashbrown" version = "0.14.3" @@ -298,6 +413,12 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" +[[package]] +name = "hermit-abi" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d3d0e0f38255e7fa3cf31335b3a56f05febd18025f4db5ef7a0cfb4f8da651f" + [[package]] name = "humansize" version = "2.1.3" @@ -463,6 +584,16 @@ dependencies = [ "autocfg", ] +[[package]] +name = "num_cpus" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" +dependencies = [ + "hermit-abi", + "libc", +] + [[package]] name = "num_enum" version = "0.7.2" @@ -490,6 +621,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" +[[package]] +name = "object" +version = "0.32.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" +dependencies = [ + "memchr", +] + [[package]] name = "oem_cp" version = "2.0.0" @@ -564,6 +704,12 @@ version = "0.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58" +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + [[package]] name = "pkg-config" version = "0.3.29" @@ -639,32 +785,58 @@ checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" name = "rc-zip" version = "2.0.1" 
dependencies = [ - "byteorder", "bzip2", "cfg-if", "chardetng", "chrono", - "clap", "crc32fast", "deflate64", "encoding_rs", - "flate2", - "humansize", - "indicatif", "lzma-rs", + "miniz_oxide", "num_enum", "oem_cp", "oval", - "positioned-io", "pretty-hex", "test-log", "thiserror", "tracing", - "tracing-subscriber", "winnow", "zstd", ] +[[package]] +name = "rc-zip-sync" +version = "2.0.1" +dependencies = [ + "cfg-if", + "chrono", + "clap", + "humansize", + "indicatif", + "oval", + "positioned-io", + "rc-zip", + "test-log", + "tracing", + "tracing-subscriber", +] + +[[package]] +name = "rc-zip-tokio" +version = "2.0.1" +dependencies = [ + "futures", + "oval", + "pin-project-lite", + "positioned-io", + "rc-zip", + "test-log", + "tokio", + "tracing", + "tracing-subscriber", +] + [[package]] name = "regex" version = "1.10.3" @@ -709,6 +881,12 @@ version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" +[[package]] +name = "rustc-demangle" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" + [[package]] name = "ryu" version = "1.0.16" @@ -761,6 +939,15 @@ version = "0.3.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" +[[package]] +name = "slab" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" +dependencies = [ + "autocfg", +] + [[package]] name = "smallvec" version = "1.13.1" @@ -835,6 +1022,30 @@ dependencies = [ "once_cell", ] +[[package]] +name = "tokio" +version = "1.35.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c89b4efa943be685f629b149f53829423f8f5531ea21249408e8e2f8671ec104" +dependencies = [ + 
"backtrace", + "bytes", + "num_cpus", + "pin-project-lite", + "tokio-macros", +] + +[[package]] +name = "tokio-macros" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "toml_datetime" version = "0.6.5" diff --git a/Cargo.toml b/Cargo.toml index ad753c6..4d79c02 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,61 +1,7 @@ -[package] -name = "rc-zip" -version = "2.0.1" -description = "zip reading" -repository = "https://github.com/fasterthanlime/rc-zip" -license = "Apache-2.0 OR MIT" -authors = ["Amos Wenger "] -edition = "2021" -readme = "README.md" - -keywords = ["zip", "unzip"] -categories = ["compression"] - -[lib] -name = "rc_zip" -path = "src/lib.rs" - -[[example]] -name = "jean" -path = "examples/jean/src/main.rs" - -[dependencies] -winnow = "0.5.36" -pretty-hex = "0.4.1" -oval = "2.0.0" -chrono = "0.4.33" -encoding_rs = "0.8.33" -crc32fast = "1.3.2" -positioned-io = { version = "0.3.3", optional = true } -tracing = "0.1.40" -oem_cp = "2.0.0" -thiserror = "1.0.56" -chardetng = "0.1.17" -flate2 = { version = "1.0.28", optional = true } -num_enum = "0.7.2" -byteorder = "1.5.0" -cfg-if = "1.0.0" -lzma-rs = { version = "0.3.0", features = ["stream"], optional = true } -deflate64 = { version = "0.1.7", optional = true } -bzip2 = { version = "0.4.4", optional = true } -zstd = { version = "0.13.0", optional = true } - -[features] -default = ["sync", "file", "deflate"] -sync = [] -file = ["positioned-io"] -deflate = ["dep:flate2"] -deflate64 = ["dep:deflate64"] -lzma = ["dep:lzma-rs"] -bzip2 = ["dep:bzip2"] -zstd = ["dep:zstd"] - -[dev-dependencies] -clap = { version = "4.4.18", features = ["derive"] } -humansize = "2.1.3" -indicatif = "0.17.7" -test-log = { version = "0.2.14", default-features = false, features = ["tracing-subscriber", "trace"] } -tracing-subscriber = { version = 
"0.3.18", features = ["env-filter"] } - -[profile.release] -debug = 1 +[workspace] +resolver = "2" +members = [ + "rc-zip", + "rc-zip-sync", + "rc-zip-tokio", +] diff --git a/Justfile b/Justfile index 03efa18..1e869da 100644 --- a/Justfile +++ b/Justfile @@ -4,12 +4,19 @@ _default: just --list check: - cargo hack clippy --feature-powerset --group-features deflate,deflate64,lzma,bzip2 + cargo hack clippy --each-feature + +docs: + RUSTDOCFLAGS="-D warnings" cargo doc --all-features --no-deps # Run all tests locally test *args: cargo nextest run {{args}} --all-features +# Report unused dependencies: +udeps: + RUSTC_BOOTSTRAP=1 cargo udeps + # Run all tests with nextest and cargo-llvm-cov ci-test: #!/bin/bash -eux diff --git a/codecov.yml b/codecov.yml deleted file mode 100644 index 7b12ce4..0000000 --- a/codecov.yml +++ /dev/null @@ -1,2 +0,0 @@ -ignore: - - "crates/jean/**" \ No newline at end of file diff --git a/examples/jean/.gitignore b/examples/jean/.gitignore deleted file mode 100644 index 53eaa21..0000000 --- a/examples/jean/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -/target -**/*.rs.bk diff --git a/examples/jean/Cargo.toml b/examples/jean/Cargo.toml deleted file mode 100644 index 98dcc24..0000000 --- a/examples/jean/Cargo.toml +++ /dev/null @@ -1,23 +0,0 @@ -[package] -name = "jean" -version.workspace = true -repository.workspace = true -license.workspace = true -authors.workspace = true -edition = "2021" -publish = false - -[dependencies] -rc-zip = { path = "../rc-zip" } -clap = { version = "4.4.18", features = ["derive"] } -humansize = "2.1.3" -positioned-io.workspace = true -indicatif = "0.17.7" -tracing-subscriber = "0.3.18" -cfg-if = "1.0.0" - -[features] -default = ["lzma"] -deflate = ["rc-zip/deflate"] -deflate64 = ["rc-zip/deflate64"] -lzma = ["rc-zip/lzma"] diff --git a/rc-zip-sync/Cargo.toml b/rc-zip-sync/Cargo.toml new file mode 100644 index 0000000..8974690 --- /dev/null +++ b/rc-zip-sync/Cargo.toml @@ -0,0 +1,45 @@ +[package] +name = "rc-zip-sync" 
+version = "2.0.1" +description = "Synchronous zip reading on top of rc-zip" +repository = "https://github.com/fasterthanlime/rc-zip" +license = "Apache-2.0 OR MIT" +authors = ["Amos Wenger "] +edition = "2021" +readme = "README.md" + +keywords = ["zip", "unzip"] +categories = ["compression"] + +[lib] +name = "rc_zip_sync" +path = "src/lib.rs" + +[[example]] +name = "jean" +path = "examples/jean.rs" + +[dependencies] +positioned-io = { version = "0.3.3", optional = true } +rc-zip = { version = "2.0.1", path = "../rc-zip" } +oval = "2.0.0" +tracing = "0.1.40" + +[features] +default = ["file", "deflate"] +file = ["positioned-io"] +deflate = ["rc-zip/deflate"] +deflate64 = ["rc-zip/deflate64"] +lzma = ["rc-zip/lzma"] +bzip2 = ["rc-zip/bzip2"] +zstd = ["rc-zip/zstd"] + +[dev-dependencies] +chrono = "0.4.33" +clap = { version = "4.4.18", features = ["derive"] } +humansize = "2.1.3" +indicatif = "0.17.7" +test-log = { version = "0.2.14", default-features = false, features = ["tracing-subscriber", "trace"] } +tracing-subscriber = { version = "0.3.18", features = ["env-filter"] } +rc-zip = { version = "2.0.1", path = "../rc-zip", features = ["corpus"] } +cfg-if = "1.0.0" diff --git a/rc-zip-sync/README.md b/rc-zip-sync/README.md new file mode 100644 index 0000000..64aeaef --- /dev/null +++ b/rc-zip-sync/README.md @@ -0,0 +1,6 @@ +# rc-zip-sync + +This crate implements zip archive reading using std (synchronous) I/O traits, +like `std::io::Read`. + +See also [rc-zip-tokio](https://crates.io/crates/rc-zip-tokio).
\ No newline at end of file diff --git a/examples/jean/src/main.rs b/rc-zip-sync/examples/jean.rs similarity index 96% rename from examples/jean/src/main.rs rename to rc-zip-sync/examples/jean.rs index 841f179..bf613d4 100644 --- a/examples/jean/src/main.rs +++ b/rc-zip-sync/examples/jean.rs @@ -1,7 +1,8 @@ use cfg_if::cfg_if; use clap::{Parser, Subcommand}; use humansize::{format_size, BINARY}; -use rc_zip::{prelude::*, EntryContents}; +use rc_zip::parse::{Archive, EntryContents, Method, Version}; +use rc_zip_sync::ReadZip; use std::{ borrow::Cow, @@ -74,7 +75,7 @@ fn main() { } fn do_main(cli: Cli) -> Result<(), Box> { - fn info(archive: &rc_zip::Archive) { + fn info(archive: &Archive) { if let Some(comment) = archive.comment() { println!("Comment:\n{}", comment); } @@ -83,9 +84,9 @@ fn do_main(cli: Cli) -> Result<(), Box> { println!("Found Zip64 end of central directory locator") } - let mut creator_versions = HashSet::::new(); - let mut reader_versions = HashSet::::new(); - let mut methods = HashSet::::new(); + let mut creator_versions = HashSet::::new(); + let mut reader_versions = HashSet::::new(); + let mut methods = HashSet::::new(); let mut compressed_size: u64 = 0; let mut uncompressed_size: u64 = 0; let mut num_dirs = 0; @@ -96,13 +97,13 @@ fn do_main(cli: Cli) -> Result<(), Box> { creator_versions.insert(entry.creator_version); reader_versions.insert(entry.reader_version); match entry.contents() { - rc_zip::EntryContents::Symlink => { + EntryContents::Symlink => { num_symlinks += 1; } - rc_zip::EntryContents::Directory => { + EntryContents::Directory => { num_dirs += 1; } - rc_zip::EntryContents::File => { + EntryContents::File => { methods.insert(entry.method()); num_files += 1; compressed_size += entry.inner.compressed_size; @@ -159,7 +160,7 @@ fn do_main(cli: Cli) -> Result<(), Box> { gid = Optional(entry.gid), ); - if let rc_zip::EntryContents::Symlink = entry.contents() { + if let EntryContents::Symlink = entry.contents() { let mut target = 
String::new(); entry.reader().read_to_string(&mut target).unwrap(); print!("\t{target}", target = target); diff --git a/rc-zip-sync/src/entry_reader.rs b/rc-zip-sync/src/entry_reader.rs new file mode 100644 index 0000000..a48a6df --- /dev/null +++ b/rc-zip-sync/src/entry_reader.rs @@ -0,0 +1,62 @@ +use rc_zip::{ + fsm::{EntryFsm, FsmResult}, + parse::StoredEntry, +}; +use std::io; + +pub(crate) struct EntryReader +where + R: io::Read, +{ + rd: R, + fsm: Option, +} + +impl EntryReader +where + R: io::Read, +{ + pub(crate) fn new(entry: &StoredEntry, rd: R) -> Self { + Self { + rd, + fsm: Some(EntryFsm::new(entry.method(), entry.inner)), + } + } +} + +impl io::Read for EntryReader +where + R: io::Read, +{ + fn read(&mut self, buf: &mut [u8]) -> io::Result { + let mut fsm = match self.fsm.take() { + Some(fsm) => fsm, + None => return Ok(0), + }; + + if fsm.wants_read() { + tracing::trace!("fsm wants read"); + let n = self.rd.read(fsm.space())?; + tracing::trace!("giving fsm {} bytes", n); + fsm.fill(n); + } else { + tracing::trace!("fsm does not want read"); + } + + match fsm.process(buf)? { + FsmResult::Continue((fsm, outcome)) => { + self.fsm = Some(fsm); + if outcome.bytes_written > 0 { + Ok(outcome.bytes_written) + } else { + // loop, it happens + self.read(buf) + } + } + FsmResult::Done(()) => { + // neat! + Ok(0) + } + } + } +} diff --git a/rc-zip-sync/src/lib.rs b/rc-zip-sync/src/lib.rs new file mode 100644 index 0000000..304a1dd --- /dev/null +++ b/rc-zip-sync/src/lib.rs @@ -0,0 +1,15 @@ +//! A library for reading zip files synchronously using std I/O traits, +//! built on top of [rc-zip](https://crates.io/crates/rc-zip). +//! +//! See also: +//! +//! 
* [rc-zip-tokio](https://crates.io/crates/rc-zip-tokio) for using tokio I/O traits + +#![warn(missing_docs)] + +mod entry_reader; +mod read_zip; + +// re-exports +pub use rc_zip; +pub use read_zip::{HasCursor, ReadZip, ReadZipWithSize, SyncArchive, SyncStoredEntry}; diff --git a/src/reader/sync/read_zip.rs b/rc-zip-sync/src/read_zip.rs similarity index 71% rename from src/reader/sync/read_zip.rs rename to rc-zip-sync/src/read_zip.rs index 01db41e..3089090 100644 --- a/src/reader/sync/read_zip.rs +++ b/rc-zip-sync/src/read_zip.rs @@ -1,34 +1,32 @@ -use crate::{ +use rc_zip::{ error::Error, - format::Archive, - reader::{sync::EntryReader, ArchiveReader, ArchiveReaderResult}, + fsm::{ArchiveFsm, FsmResult}, + parse::{Archive, StoredEntry}, }; + +use crate::entry_reader::EntryReader; use std::{io::Read, ops::Deref}; -/// A trait for reading something as a zip archive (blocking I/O model) +/// A trait for reading something as a zip archive /// /// See also [ReadZip]. pub trait ReadZipWithSize { + /// The type of the file to read from. type File: HasCursor; /// Reads self as a zip archive. - /// - /// This functions blocks until the entire archive has been read. - /// It is not compatible with non-blocking or async I/O. fn read_zip_with_size(&self, size: u64) -> Result, Error>; } -/// A trait for reading something as a zip archive (blocking I/O model), -/// when we can tell size from self. +/// A trait for reading something as a zip archive when we can tell size from +/// self. /// /// See also [ReadZipWithSize]. pub trait ReadZip { + /// The type of the file to read from. type File: HasCursor; /// Reads self as a zip archive. - /// - /// This functions blocks until the entire archive has been read. - /// It is not compatible with non-blocking or async I/O. 
fn read_zip(&self) -> Result, Error>; } @@ -39,27 +37,32 @@ where type File = F; fn read_zip_with_size(&self, size: u64) -> Result, Error> { - let mut ar = ArchiveReader::new(size); + tracing::trace!(%size, "read_zip_with_size"); + let mut fsm = ArchiveFsm::new(size); loop { - if let Some(offset) = ar.wants_read() { - match ar.read(&mut self.cursor_at(offset)) { + if let Some(offset) = fsm.wants_read() { + tracing::trace!(%offset, "read_zip_with_size: wants_read, space len = {}", fsm.space().len()); + match self.cursor_at(offset).read(fsm.space()) { Ok(read_bytes) => { + tracing::trace!(%read_bytes, "read_zip_with_size: read"); if read_bytes == 0 { return Err(Error::IO(std::io::ErrorKind::UnexpectedEof.into())); } + fsm.fill(read_bytes); } Err(err) => return Err(Error::IO(err)), } } - match ar.process()? { - ArchiveReaderResult::Done(archive) => { + fsm = match fsm.process()? { + FsmResult::Done(archive) => { + tracing::trace!("read_zip_with_size: done"); return Ok(SyncArchive { file: self, archive, - }) + }); } - ArchiveReaderResult::Continue => {} + FsmResult::Continue(fsm) => fsm, } } } @@ -81,6 +84,11 @@ impl ReadZip for Vec { } } +/// A zip archive, read synchronously from a file or other I/O resource. +/// +/// This only contains metadata for the archive and its entries. Separate +/// readers can be created for arbitraries entries on-demand using +/// [SyncStoredEntry::reader]. pub struct SyncArchive<'a, F> where F: HasCursor, @@ -115,8 +123,8 @@ where /// Attempts to look up an entry by name. This is usually a bad idea, /// as names aren't necessarily normalized in zip archives. pub fn by_name>(&self, name: N) -> Option> { - self.entries - .iter() + self.archive + .entries() .find(|&x| x.name() == name.as_ref()) .map(|entry| SyncStoredEntry { file: self.file, @@ -125,13 +133,14 @@ where } } +/// A zip entry, read synchronously from a file or other I/O resource. 
pub struct SyncStoredEntry<'a, F> { file: &'a F, - entry: &'a crate::StoredEntry, + entry: &'a StoredEntry, } impl Deref for SyncStoredEntry<'_, F> { - type Target = crate::StoredEntry; + type Target = StoredEntry; fn deref(&self) -> &Self::Target { self.entry @@ -143,9 +152,8 @@ where F: HasCursor, { /// Returns a reader for the entry. - pub fn reader(&self) -> EntryReader<::Cursor<'a>> { - tracing::trace!("Creating EntryReader"); - EntryReader::new(self.entry, |offset| self.file.cursor_at(offset)) + pub fn reader(&self) -> impl Read + 'a { + EntryReader::new(self.entry, self.file.cursor_at(self.entry.header_offset)) } /// Reads the entire entry into a vector. @@ -158,6 +166,7 @@ where /// A sliceable I/O resource: we can ask for a [Read] at a given offset. pub trait HasCursor { + /// The type of [Read] returned by [HasCursor::cursor_at]. type Cursor<'a>: Read + 'a where Self: 'a; diff --git a/rc-zip-sync/tests/integration_tests.rs b/rc-zip-sync/tests/integration_tests.rs new file mode 100644 index 0000000..a459318 --- /dev/null +++ b/rc-zip-sync/tests/integration_tests.rs @@ -0,0 +1,51 @@ +use rc_zip::{ + corpus::{self, zips_dir, Case}, + error::Error, + parse::Archive, +}; +use rc_zip_sync::{HasCursor, ReadZip, SyncArchive}; + +use std::fs::File; + +fn check_case(test: &Case, archive: Result, Error>) { + corpus::check_case(test, archive.as_ref().map(|ar| -> &Archive { ar })); + let archive = match archive { + Ok(archive) => archive, + Err(_) => return, + }; + + for file in &test.files { + let entry = archive + .by_name(file.name) + .unwrap_or_else(|| panic!("entry {} should exist", file.name)); + + corpus::check_file_against(file, &entry, &entry.bytes().unwrap()[..]) + } +} + +#[test_log::test] +fn read_from_slice() { + let bytes = std::fs::read(zips_dir().join("test.zip")).unwrap(); + let slice = &bytes[..]; + let archive = slice.read_zip().unwrap(); + assert_eq!(archive.entries().count(), 2); +} + +#[test_log::test] +fn read_from_file() { + let f = 
File::open(zips_dir().join("test.zip")).unwrap(); + let archive = f.read_zip().unwrap(); + assert_eq!(archive.entries().count(), 2); +} + +#[test_log::test] +fn real_world_files() { + for case in corpus::test_cases() { + tracing::info!("============ testing {}", case.name); + + let file = File::open(case.absolute_path()).unwrap(); + let archive = file.read_zip().map_err(Error::from); + + check_case(&case, archive) + } +} diff --git a/rc-zip-tokio/Cargo.toml b/rc-zip-tokio/Cargo.toml new file mode 100644 index 0000000..ee65588 --- /dev/null +++ b/rc-zip-tokio/Cargo.toml @@ -0,0 +1,39 @@ +[package] +name = "rc-zip-tokio" +version = "2.0.1" +description = "Asynchronous zip reading on top of rc-zip (for tokio I/O traits)" +repository = "https://github.com/fasterthanlime/rc-zip" +license = "Apache-2.0 OR MIT" +authors = ["Amos Wenger "] +edition = "2021" +readme = "README.md" + +keywords = ["zip", "unzip"] +categories = ["compression"] + +[lib] +name = "rc_zip_tokio" +path = "src/lib.rs" + +[dependencies] +rc-zip = { version = "2.0.1", path = "../rc-zip" } +positioned-io = { version = "0.3.3" } +tokio = { version = "1.35.1", features = ["fs", "io-util", "rt-multi-thread"] } +futures = { version = "0.3.30" } +pin-project-lite = { version = "0.2.13" } +oval = "2.0.0" +tracing = "0.1.40" + +[features] +default = ["deflate"] +deflate = ["rc-zip/deflate"] +deflate64 = ["rc-zip/deflate64"] +lzma = ["rc-zip/lzma"] +bzip2 = ["rc-zip/bzip2"] +zstd = ["rc-zip/zstd"] + +[dev-dependencies] +test-log = { version = "0.2.14", default-features = false, features = ["tracing-subscriber", "trace"] } +tracing-subscriber = { version = "0.3.18", features = ["env-filter"] } +rc-zip = { version = "2.0.1", path = "../rc-zip", features = ["corpus"] } +tokio = { version = "1.35.1", features = ["rt", "macros"] } \ No newline at end of file diff --git a/rc-zip-tokio/README.md b/rc-zip-tokio/README.md new file mode 100644 index 0000000..2a9196e --- /dev/null +++ b/rc-zip-tokio/README.md @@ -0,0 +1,7
@@ +# rc-zip-tokio + +This crate implements zip archive reading using tokio (asynchronous) I/O traits, +like `tokio::io::AsyncRead`. + +See also [rc-zip-sync](https://crates.io/crates/rc-zip-sync). + diff --git a/rc-zip-tokio/src/async_read_zip.rs b/rc-zip-tokio/src/async_read_zip.rs new file mode 100644 index 0000000..bc68be0 --- /dev/null +++ b/rc-zip-tokio/src/async_read_zip.rs @@ -0,0 +1,294 @@ +use std::{io, ops::Deref, pin::Pin, sync::Arc, task}; + +use futures::future::BoxFuture; +use positioned_io::{RandomAccessFile, ReadAt, Size}; +use tokio::io::{AsyncRead, AsyncReadExt, ReadBuf}; + +use rc_zip::{ + error::Error, + fsm::{ArchiveFsm, FsmResult}, + parse::{Archive, StoredEntry}, +}; + +use crate::entry_reader::EntryReader; + +/// A trait for reading something as a zip archive. +/// +/// See also [ReadZipAsync]. +pub trait ReadZipWithSizeAsync { + /// The type of the file to read from. + type File: HasAsyncCursor; + + /// Reads self as a zip archive. + #[allow(async_fn_in_trait)] + async fn read_zip_with_size_async( + &self, + size: u64, + ) -> Result, Error>; +} + +/// A trait for reading something as a zip archive when we can tell size from +/// self. +/// +/// The resulting [AsyncArchive] only holds metadata; entry readers are created +/// on demand. See also [ReadZipWithSizeAsync]. +pub trait ReadZipAsync { + /// The type of the file to read from. + type File: HasAsyncCursor; + + /// Reads self as a zip archive.
+ #[allow(async_fn_in_trait)] + async fn read_zip_async(&self) -> Result, Error>; +} + +impl ReadZipWithSizeAsync for F +where + F: HasAsyncCursor, +{ + type File = F; + + async fn read_zip_with_size_async(&self, size: u64) -> Result, Error> { + let mut fsm = ArchiveFsm::new(size); + loop { + if let Some(offset) = fsm.wants_read() { + match self.cursor_at(offset).read(fsm.space()).await { + Ok(read_bytes) => { + if read_bytes == 0 { + return Err(Error::IO(io::ErrorKind::UnexpectedEof.into())); + } + fsm.fill(read_bytes); + } + Err(err) => return Err(Error::IO(err)), + } + } + + fsm = match fsm.process()? { + FsmResult::Done(archive) => { + return Ok(AsyncArchive { + file: self, + archive, + }) + } + FsmResult::Continue(fsm) => fsm, + } + } + } +} + +impl ReadZipAsync for &[u8] { + type File = Self; + + async fn read_zip_async(&self) -> Result, Error> { + self.read_zip_with_size_async(self.len() as u64).await + } +} + +impl ReadZipAsync for Vec { + type File = Self; + + async fn read_zip_async(&self) -> Result, Error> { + self.read_zip_with_size_async(self.len() as u64).await + } +} + +impl ReadZipAsync for Arc { + type File = Self; + + async fn read_zip_async(&self) -> Result, Error> { + let size = self.size()?.unwrap_or_default(); + self.read_zip_with_size_async(size).await + } +} + +/// A zip archive, read asynchronously from a file or other I/O resource. +pub struct AsyncArchive<'a, F> +where + F: HasAsyncCursor, +{ + file: &'a F, + archive: Archive, +} + +impl Deref for AsyncArchive<'_, F> +where + F: HasAsyncCursor, +{ + type Target = Archive; + + fn deref(&self) -> &Self::Target { + &self.archive + } +} + +impl AsyncArchive<'_, F> +where + F: HasAsyncCursor, +{ + /// Iterate over all files in this zip, read from the central directory. + pub fn entries(&self) -> impl Iterator> { + self.archive.entries().map(move |entry| AsyncStoredEntry { + file: self.file, + entry, + }) + } + + /// Attempts to look up an entry by name. 
This is usually a bad idea, + /// as names aren't necessarily normalized in zip archives. + pub fn by_name>(&self, name: N) -> Option> { + self.archive + .entries() + .find(|&x| x.name() == name.as_ref()) + .map(|entry| AsyncStoredEntry { + file: self.file, + entry, + }) + } +} + +/// A single entry in a zip archive, read asynchronously from a file or other I/O resource. +pub struct AsyncStoredEntry<'a, F> { + file: &'a F, + entry: &'a StoredEntry, +} + +impl Deref for AsyncStoredEntry<'_, F> { + type Target = StoredEntry; + + fn deref(&self) -> &Self::Target { + self.entry + } +} + +impl<'a, F> AsyncStoredEntry<'a, F> +where + F: HasAsyncCursor, +{ + /// Returns a reader for the entry. + pub fn reader(&self) -> impl AsyncRead + Unpin + '_ { + EntryReader::new(self.entry, |offset| self.file.cursor_at(offset)) + } + + /// Reads the entire entry into a vector. + pub async fn bytes(&self) -> io::Result> { + let mut v = Vec::new(); + self.reader().read_to_end(&mut v).await?; + Ok(v) + } +} + +/// A sliceable I/O resource: we can ask for an [AsyncRead] at a given offset. +pub trait HasAsyncCursor { + /// The type returned by [HasAsyncCursor::cursor_at]. + type Cursor<'a>: AsyncRead + Unpin + 'a + where + Self: 'a; + + /// Returns an [AsyncRead] at the given offset. + fn cursor_at(&self, offset: u64) -> Self::Cursor<'_>; +} + +impl HasAsyncCursor for &[u8] { + type Cursor<'a> = &'a [u8] + where + Self: 'a; + + fn cursor_at(&self, offset: u64) -> Self::Cursor<'_> { + &self[offset.try_into().unwrap()..] + } +} + +impl HasAsyncCursor for Vec { + type Cursor<'a> = &'a [u8] + where + Self: 'a; + + fn cursor_at(&self, offset: u64) -> Self::Cursor<'_> { + &self[offset.try_into().unwrap()..] 
+ } +} + +impl HasAsyncCursor for Arc { + type Cursor<'a> = AsyncRandomAccessFileCursor + where + Self: 'a; + + fn cursor_at(&self, offset: u64) -> Self::Cursor<'_> { + AsyncRandomAccessFileCursor { + pos: offset, + state: ARAFCState::Idle(ARAFCCore { + inner_buf: vec![0u8; 128 * 1024], + file: self.clone(), + }), + } + } +} + +struct ARAFCCore { + inner_buf: Vec, + file: Arc, +} + +type JoinResult = Result; + +#[derive(Default)] +enum ARAFCState { + Idle(ARAFCCore), + Reading { + fut: BoxFuture<'static, JoinResult<(Result, ARAFCCore)>>, + }, + + #[default] + Transitioning, +} + +/// A cursor for reading from a [RandomAccessFile] asynchronously. +pub struct AsyncRandomAccessFileCursor { + pos: u64, + state: ARAFCState, +} + +impl AsyncRead for AsyncRandomAccessFileCursor { + fn poll_read( + mut self: Pin<&mut Self>, + cx: &mut task::Context<'_>, + buf: &mut ReadBuf<'_>, + ) -> task::Poll> { + match &mut self.state { + ARAFCState::Idle { .. } => { + let mut core = match std::mem::take(&mut self.state) { + ARAFCState::Idle(core) => core, + _ => unreachable!(), + }; + let pos = self.pos; + let fut = Box::pin(tokio::task::spawn_blocking(move || { + let read = core.file.read_at(pos, &mut core.inner_buf); + (read, core) + })); + self.state = ARAFCState::Reading { fut }; + self.poll_read(cx, buf) + } + ARAFCState::Reading { fut } => { + let (read, core) = match fut.as_mut().poll(cx) { + task::Poll::Ready(Ok(r)) => r, + task::Poll::Ready(Err(e)) => { + return task::Poll::Ready(Err(io::Error::new( + io::ErrorKind::Other, + e.to_string(), + ))) + } + task::Poll::Pending => return task::Poll::Pending, + }; + match read { + Ok(read) => { + self.pos += read as u64; + buf.put_slice(&core.inner_buf[..read]); + self.state = ARAFCState::Idle(core); + task::Poll::Ready(Ok(())) + } + Err(e) => task::Poll::Ready(Err(e)), + } + } + ARAFCState::Transitioning => unreachable!(), + } + } +} diff --git a/rc-zip-tokio/src/entry_reader.rs b/rc-zip-tokio/src/entry_reader.rs new file mode 
100644 index 0000000..c4af59b --- /dev/null +++ b/rc-zip-tokio/src/entry_reader.rs @@ -0,0 +1,87 @@ +use std::{pin::Pin, task}; + +use pin_project_lite::pin_project; +use rc_zip::{ + fsm::{EntryFsm, FsmResult}, + parse::StoredEntry, +}; +use tokio::io::{AsyncRead, ReadBuf}; + +pin_project! { + pub(crate) struct EntryReader + where + R: AsyncRead, + { + #[pin] + rd: R, + fsm: Option, + } +} + +impl EntryReader +where + R: AsyncRead, +{ + pub(crate) fn new(entry: &StoredEntry, get_reader: F) -> Self + where + F: Fn(u64) -> R, + { + Self { + rd: get_reader(entry.header_offset), + fsm: Some(EntryFsm::new(entry.method(), entry.inner)), + } + } +} + +impl AsyncRead for EntryReader +where + R: AsyncRead, +{ + fn poll_read( + mut self: Pin<&mut Self>, + cx: &mut task::Context<'_>, + buf: &mut ReadBuf<'_>, + ) -> task::Poll> { + let this = self.as_mut().project(); + + let mut fsm = match this.fsm.take() { + Some(fsm) => fsm, + None => return Ok(()).into(), + }; + + if fsm.wants_read() { + tracing::trace!(space_avail = fsm.space().len(), "fsm wants read"); + let mut buf = ReadBuf::new(fsm.space()); + match this.rd.poll_read(cx, &mut buf) { + task::Poll::Ready(res) => res?, + task::Poll::Pending => { + *this.fsm = Some(fsm); + return task::Poll::Pending; + } + } + let n = buf.filled().len(); + + tracing::trace!("read {} bytes", n); + fsm.fill(n); + } else { + tracing::trace!("fsm does not want read"); + } + + match fsm.process(buf.initialize_unfilled())? { + FsmResult::Continue((fsm, outcome)) => { + *this.fsm = Some(fsm); + if outcome.bytes_written > 0 { + tracing::trace!("wrote {} bytes", outcome.bytes_written); + buf.advance(outcome.bytes_written); + } else { + // loop, it happens + return self.poll_read(cx, buf); + } + } + FsmResult::Done(()) => { + // neat! + } + } + Ok(()).into() + } +} diff --git a/rc-zip-tokio/src/lib.rs b/rc-zip-tokio/src/lib.rs new file mode 100644 index 0000000..8666c73 --- /dev/null +++ b/rc-zip-tokio/src/lib.rs @@ -0,0 +1,17 @@ +//! 
A library for reading zip files asynchronously using tokio I/O traits, +//! based on top of [rc-zip](https://crates.io/crates/rc-zip). +//! +//! See also: +//! +//! * [rc-zip-sync](https://crates.io/crates/rc-zip-sync) for using std I/O traits + +#![warn(missing_docs)] + +mod async_read_zip; +mod entry_reader; + +// re-exports +pub use async_read_zip::{ + AsyncArchive, AsyncStoredEntry, HasAsyncCursor, ReadZipAsync, ReadZipWithSizeAsync, +}; +pub use rc_zip; diff --git a/rc-zip-tokio/tests/integration_tests.rs b/rc-zip-tokio/tests/integration_tests.rs new file mode 100644 index 0000000..7cf49c2 --- /dev/null +++ b/rc-zip-tokio/tests/integration_tests.rs @@ -0,0 +1,52 @@ +use positioned_io::RandomAccessFile; +use rc_zip::{ + corpus::{self, zips_dir, Case}, + error::Error, + parse::Archive, +}; +use rc_zip_tokio::{AsyncArchive, HasAsyncCursor, ReadZipAsync}; + +use std::sync::Arc; + +async fn check_case(test: &Case, archive: Result, Error>) { + corpus::check_case(test, archive.as_ref().map(|ar| -> &Archive { ar })); + let archive = match archive { + Ok(archive) => archive, + Err(_) => return, + }; + + for file in &test.files { + let entry = archive + .by_name(file.name) + .unwrap_or_else(|| panic!("entry {} should exist", file.name)); + + corpus::check_file_against(file, &entry, &entry.bytes().await.unwrap()[..]) + } +} + +#[test_log::test(tokio::test)] +async fn read_from_slice() { + let bytes = std::fs::read(zips_dir().join("test.zip")).unwrap(); + let slice = &bytes[..]; + let archive = slice.read_zip_async().await.unwrap(); + assert_eq!(archive.entries().count(), 2); +} + +#[test_log::test(tokio::test)] +async fn read_from_file() { + let f = Arc::new(RandomAccessFile::open(zips_dir().join("test.zip")).unwrap()); + let archive = f.read_zip_async().await.unwrap(); + assert_eq!(archive.entries().count(), 2); +} + +#[test_log::test(tokio::test)] +async fn real_world_files() { + for case in corpus::test_cases() { + tracing::info!("============ testing {}", case.name); 
+ + let file = Arc::new(RandomAccessFile::open(case.absolute_path()).unwrap()); + let archive = file.read_zip_async().await; + + check_case(&case, archive).await + } +} diff --git a/rc-zip/Cargo.toml b/rc-zip/Cargo.toml new file mode 100644 index 0000000..9177949 --- /dev/null +++ b/rc-zip/Cargo.toml @@ -0,0 +1,46 @@ +[package] +name = "rc-zip" +version = "2.0.1" +description = "An I/O-agnostic implementation of the zip file format" +repository = "https://github.com/fasterthanlime/rc-zip" +license = "Apache-2.0 OR MIT" +authors = ["Amos Wenger "] +edition = "2021" +readme = "../README.md" + +keywords = ["zip", "unzip"] +categories = ["compression"] + +[lib] +name = "rc_zip" +path = "src/lib.rs" + +[dependencies] +winnow = "0.5.36" +pretty-hex = "0.4.1" +oval = "2.0.0" +chrono = "0.4.33" +encoding_rs = "0.8.33" +tracing = "0.1.40" +oem_cp = "2.0.0" +thiserror = "1.0.56" +chardetng = "0.1.17" +num_enum = "0.7.2" +cfg-if = "1.0.0" +crc32fast = "1.3.2" +miniz_oxide = { version = "0.7.1", optional = true } +deflate64 = { version = "0.1.7", optional = true } +bzip2 = { version = "0.4.4", optional = true } +lzma-rs = { version = "0.3.0", optional = true, features = ["stream"] } +zstd = { version = "0.13.0", optional = true } + +[features] +corpus = [] +deflate = ["dep:miniz_oxide"] +deflate64 = ["dep:deflate64"] +bzip2 = ["dep:bzip2"] +lzma = ["dep:lzma-rs"] +zstd = ["dep:zstd"] + +[dev-dependencies] +test-log = { version = "0.2.14", default-features = false, features = ["tracing-subscriber", "trace"] } diff --git a/rc-zip/README.md b/rc-zip/README.md new file mode 100644 index 0000000..996cdde --- /dev/null +++ b/rc-zip/README.md @@ -0,0 +1,7 @@ +# rc-zip + +This is the core rc-zip crate, containing types, parses, and state machines, +and that doesn't do any I/O by itself. + +The full README for this crate is the [top-level README](../README.md) in this +repository. 
diff --git a/rc-zip/src/corpus/mod.rs b/rc-zip/src/corpus/mod.rs new file mode 100644 index 0000000..2cc8cc9 --- /dev/null +++ b/rc-zip/src/corpus/mod.rs @@ -0,0 +1,290 @@ +#![allow(missing_docs)] + +//! A corpus of zip files for testing. + +use std::path::PathBuf; + +use chrono::{DateTime, FixedOffset, TimeZone, Timelike, Utc}; + +use crate::{ + encoding::Encoding, + error::Error, + parse::{Archive, EntryContents, StoredEntry}, +}; + +pub struct Case { + pub name: &'static str, + pub expected_encoding: Option, + pub comment: Option<&'static str>, + pub files: Vec, + pub error: Option, +} + +impl Default for Case { + fn default() -> Self { + Self { + name: "test.zip", + expected_encoding: None, + comment: None, + files: vec![], + error: None, + } + } +} + +impl Case { + pub fn absolute_path(&self) -> PathBuf { + zips_dir().join(self.name) + } +} + +pub struct CaseFile { + pub name: &'static str, + pub mode: Option, + pub modified: Option>, + pub content: FileContent, +} + +pub enum FileContent { + Unchecked, + Bytes(Vec), + File(&'static str), +} + +impl Default for CaseFile { + fn default() -> Self { + Self { + name: "default", + mode: None, + modified: None, + content: FileContent::Unchecked, + } + } +} + +pub fn zips_dir() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .parent() + .unwrap() + .join("testdata") +} + +fn time_zone(hours: i32) -> FixedOffset { + FixedOffset::east_opt(hours * 3600).unwrap() +} + +fn date( + (year, month, day): (i32, u32, u32), + (hour, min, sec): (u32, u32, u32), + nsec: u32, + offset: FixedOffset, +) -> Option> { + Some( + offset + .with_ymd_and_hms(year, month, day, hour, min, sec) + .single()? + .with_nanosecond(nsec)? 
+ .into(), + ) +} + +pub fn test_cases() -> Vec { + vec![ + Case { + name: "zip64.zip", + files: vec![CaseFile { + name: "README", + content: FileContent::Bytes( + "This small file is in ZIP64 format.\n".as_bytes().into(), + ), + modified: Some(date((2012, 8, 10), (14, 33, 32), 0, time_zone(0)).unwrap()), + mode: Some(0o644), + }], + ..Default::default() + }, + Case { + name: "test.zip", + comment: Some("This is a zipfile comment."), + expected_encoding: Some(Encoding::Utf8), + files: vec![ + CaseFile { + name: "test.txt", + content: FileContent::Bytes("This is a test text file.\n".as_bytes().into()), + modified: Some(date((2010, 9, 5), (12, 12, 1), 0, time_zone(10)).unwrap()), + mode: Some(0o644), + }, + CaseFile { + name: "gophercolor16x16.png", + content: FileContent::File("gophercolor16x16.png"), + modified: Some(date((2010, 9, 5), (15, 52, 58), 0, time_zone(10)).unwrap()), + mode: Some(0o644), + }, + ], + ..Default::default() + }, + Case { + name: "cp-437.zip", + expected_encoding: Some(Encoding::Cp437), + files: vec![CaseFile { + name: "français", + ..Default::default() + }], + ..Default::default() + }, + Case { + name: "shift-jis.zip", + expected_encoding: Some(Encoding::ShiftJis), + files: vec![ + CaseFile { + name: "should-be-jis/", + ..Default::default() + }, + CaseFile { + name: "should-be-jis/ot_運命のワルツネぞなぞ小さな楽しみ遊びま.longboi", + ..Default::default() + }, + ], + ..Default::default() + }, + Case { + name: "utf8-winrar.zip", + expected_encoding: Some(Encoding::Utf8), + files: vec![CaseFile { + name: "世界", + content: FileContent::Bytes(vec![]), + modified: Some(date((2017, 11, 6), (21, 9, 27), 867862500, time_zone(0)).unwrap()), + ..Default::default() + }], + ..Default::default() + }, + #[cfg(feature = "lzma")] + Case { + name: "found-me-lzma.zip", + expected_encoding: Some(Encoding::Utf8), + files: vec![CaseFile { + name: "found-me.txt", + content: FileContent::Bytes("Oh no, you found me\n".repeat(5000).into()), + modified: Some(date((2024, 1, 26), (16, 14, 
35), 46003100, time_zone(0)).unwrap()), + ..Default::default() + }], + ..Default::default() + }, + #[cfg(feature = "deflate64")] + Case { + name: "found-me-deflate64.zip", + expected_encoding: Some(Encoding::Utf8), + files: vec![CaseFile { + name: "found-me.txt", + content: FileContent::Bytes("Oh no, you found me\n".repeat(5000).into()), + modified: Some(date((2024, 1, 26), (16, 14, 35), 46003100, time_zone(0)).unwrap()), + ..Default::default() + }], + ..Default::default() + }, + // same with bzip2 + #[cfg(feature = "bzip2")] + Case { + name: "found-me-bzip2.zip", + expected_encoding: Some(Encoding::Utf8), + files: vec![CaseFile { + name: "found-me.txt", + content: FileContent::Bytes("Oh no, you found me\n".repeat(5000).into()), + modified: Some(date((2024, 1, 26), (16, 14, 35), 46003100, time_zone(0)).unwrap()), + ..Default::default() + }], + ..Default::default() + }, + // same with zstd + #[cfg(feature = "zstd")] + Case { + name: "found-me-zstd.zip", + expected_encoding: Some(Encoding::Utf8), + files: vec![CaseFile { + name: "found-me.txt", + content: FileContent::Bytes("Oh no, you found me\n".repeat(5000).into()), + modified: Some(date((2024, 1, 31), (6, 10, 25), 800491400, time_zone(0)).unwrap()), + ..Default::default() + }], + ..Default::default() + }, + ] +} + +pub fn check_case(test: &Case, archive: Result<&Archive, &Error>) { + let case_bytes = std::fs::read(test.absolute_path()).unwrap(); + + if let Some(expected) = &test.error { + let actual = match archive { + Err(e) => e, + Ok(_) => panic!("should have failed"), + }; + let expected = format!("{:#?}", expected); + let actual = format!("{:#?}", actual); + assert_eq!(expected, actual); + return; + } + let archive = archive.unwrap(); + + assert_eq!(case_bytes.len() as u64, archive.size()); + + if let Some(expected) = test.comment { + assert_eq!(expected, archive.comment().expect("should have comment")) + } + + if let Some(exp_encoding) = test.expected_encoding { + assert_eq!(archive.encoding(), 
exp_encoding); + } + + assert_eq!( + test.files.len(), + archive.entries().count(), + "{} should have {} entries files", + test.name, + test.files.len() + ); + + // then each implementation should check individual files +} + +pub fn check_file_against(file: &CaseFile, entry: &StoredEntry, actual_bytes: &[u8]) { + if let Some(expected) = file.modified { + assert_eq!( + expected, + entry.modified(), + "entry {} should have modified = {:?}", + entry.name(), + expected + ) + } + + if let Some(mode) = file.mode { + assert_eq!(entry.mode.0 & 0o777, mode); + } + + // I have honestly yet to see a zip file _entry_ with a comment. + assert!(entry.comment().is_none()); + + match entry.contents() { + EntryContents::File => { + match &file.content { + FileContent::Unchecked => { + // ah well + } + FileContent::Bytes(expected_bytes) => { + // first check length + assert_eq!(actual_bytes.len(), expected_bytes.len()); + assert_eq!(actual_bytes, &expected_bytes[..]) + } + FileContent::File(file_path) => { + let expected_bytes = std::fs::read(zips_dir().join(file_path)).unwrap(); + // first check length + assert_eq!(actual_bytes.len(), expected_bytes.len()); + assert_eq!(actual_bytes, &expected_bytes[..]) + } + } + } + EntryContents::Symlink | EntryContents::Directory => { + assert!(matches!(file.content, FileContent::Unchecked)); + } + } +} diff --git a/src/encoding.rs b/rc-zip/src/encoding.rs similarity index 88% rename from src/encoding.rs rename to rc-zip/src/encoding.rs index 823221d..1d2c1ec 100644 --- a/src/encoding.rs +++ b/rc-zip/src/encoding.rs @@ -1,16 +1,27 @@ +//! Character encodings used in ZIP files. +//! +//! ZIP entry paths may be encoded in a variety of character encodings: +//! historically, CP-437 was used, but many modern zip files use UTF-8 with an +//! optional UTF-8 flag. +//! +//! Others use the system's local character encoding, and we have no choice but +//! to make an educated guess thanks to the chardet-ng crate. 
+ use std::fmt; /// Encodings supported by this crate #[derive(Clone, Copy, PartialEq, Eq, Debug)] pub enum Encoding { - /// UTF-8 + /// [UTF-8](https://en.wikipedia.org/wiki/UTF-8), opt-in for ZIP files. Utf8, + /// [Codepage 437](https://en.wikipedia.org/wiki/Code_page_437), also known as /// OEM-US, PC-8, or DOS Latin US. /// /// This is the fallback if UTF-8 is not specified and no other encoding /// is auto-detected. It was the original encoding of the zip format. Cp437, + /// [Shift JIS](https://en.wikipedia.org/wiki/Shift_JIS), also known as SJIS. /// /// Still in use by some Japanese users as of 2019. @@ -42,6 +53,7 @@ pub enum DecodingError { #[error("text too large to be converted")] StringTooLarge, + /// Text is not valid in the given encoding. #[error("encoding error: {0}")] EncodingError(&'static str), } diff --git a/src/error.rs b/rc-zip/src/error.rs similarity index 66% rename from src/error.rs rename to rc-zip/src/error.rs index dafa1a9..0ba94f9 100644 --- a/src/error.rs +++ b/rc-zip/src/error.rs @@ -1,4 +1,6 @@ -use crate::Method; +//! All error types used in this crate + +use crate::parse::Method; use super::encoding; @@ -21,35 +23,60 @@ pub enum Error { #[error("io: {0}")] IO(#[from] std::io::Error), + /// Decompression-related error + #[error("{method:?} decompression error: {msg}")] + Decompression { + /// The compression method that failed + method: Method, + /// Additional information + msg: String, + }, + /// Could not read as a zip because size could not be determined #[error("size must be known to open zip file")] UnknownSize, } impl Error { - #[allow(unused)] - pub(crate) fn method_not_supported(method: Method) -> Self { + /// Create a new error indicating that the given method is not supported. 
+ pub fn method_not_supported(method: Method) -> Self { Self::Unsupported(UnsupportedError::MethodNotSupported(method)) } - #[allow(unused)] - pub(crate) fn method_not_enabled(method: Method) -> Self { + /// Create a new error indicating that the given method is not enabled. + pub fn method_not_enabled(method: Method) -> Self { Self::Unsupported(UnsupportedError::MethodNotEnabled(method)) } } +/// Some part of the zip format is not supported by this crate. #[derive(Debug, thiserror::Error)] pub enum UnsupportedError { + /// The compression method is not supported. #[error("compression method not supported: {0:?}")] - MethodNotSupported(crate::format::Method), + MethodNotSupported(Method), + /// The compression method is supported, but not enabled in this build. #[error("compression method supported, but not enabled in this build: {0:?}")] - MethodNotEnabled(crate::format::Method), + MethodNotEnabled(Method), + /// The zip file uses a version of LZMA that is not supported. #[error("only LZMA2.0 is supported, found LZMA{minor}.{major}")] - LzmaVersionUnsupported { minor: u8, major: u8 }, + LzmaVersionUnsupported { + /// major version read from LZMA properties header, cf. appnote 5.8.8 + major: u8, + /// minor version read from LZMA properties header, cf. appnote 5.8.8 + minor: u8, + }, + + /// The LZMA properties header is not the expected size. #[error("LZMA properties header wrong size: expected {expected} bytes, got {actual} bytes")] - LzmaPropertiesHeaderWrongSize { expected: u16, actual: u16 }, + LzmaPropertiesHeaderWrongSize { + /// expected size in bytes + expected: u16, + /// actual size in bytes, read from a u16, cf. appnote 5.8.8 + actual: u16, + }, } /// Specific zip format errors, mostly due to invalid zip archives but that could also stem from @@ -80,7 +107,12 @@ pub enum FormatError { /// a certain number of files, but we weren't able to read the same number of central directory /// headers. 
#[error("invalid central record: expected to read {expected} files, got {actual}")] - InvalidCentralRecord { expected: u16, actual: u16 }, + InvalidCentralRecord { + /// expected number of files + expected: u16, + /// actual number of files + actual: u16, + }, /// An extra field (that we support) was not decoded correctly. /// @@ -94,7 +126,9 @@ pub enum FormatError { /// claimed_records_count * minimum_entry_size, we know it's not a valid zip file. #[error("impossible number of files: claims to have {claimed_records_count}, but zip size is {zip_size}")] ImpossibleNumberOfFiles { + /// number of files claimed in the end of central directory record claimed_records_count: u64, + /// total size of the zip file zip_size: u64, }, @@ -108,14 +142,21 @@ pub enum FormatError { /// The uncompressed size didn't match #[error("uncompressed size didn't match: expected {expected}, got {actual}")] - WrongSize { expected: u64, actual: u64 }, + WrongSize { + /// expected size in bytes (from the local header, data descriptor, etc.) + expected: u64, + /// actual size in bytes (from decompressing the entry) + actual: u64, + }, /// The CRC-32 checksum didn't match. #[error("checksum didn't match: expected {expected:x?}, got {actual:x?}")] - WrongChecksum { expected: u32, actual: u32 }, - - #[error("lzma properties larger than max")] - LzmaPropertiesLargerThanMax, + WrongChecksum { + /// expected checksum (from the data descriptor, etc.) 
+ expected: u32, + /// actual checksum (from decompressing the entry) + actual: u32, + }, } impl From for std::io::Error { diff --git a/src/format/archive.rs b/rc-zip/src/format/archive.rs similarity index 100% rename from src/format/archive.rs rename to rc-zip/src/format/archive.rs diff --git a/src/format/date_time.rs b/rc-zip/src/format/date_time.rs similarity index 100% rename from src/format/date_time.rs rename to rc-zip/src/format/date_time.rs diff --git a/src/format/directory_header.rs b/rc-zip/src/format/directory_header.rs similarity index 100% rename from src/format/directory_header.rs rename to rc-zip/src/format/directory_header.rs diff --git a/src/format/eocd.rs b/rc-zip/src/format/eocd.rs similarity index 100% rename from src/format/eocd.rs rename to rc-zip/src/format/eocd.rs diff --git a/src/format/extra_field.rs b/rc-zip/src/format/extra_field.rs similarity index 100% rename from src/format/extra_field.rs rename to rc-zip/src/format/extra_field.rs diff --git a/src/format/local.rs b/rc-zip/src/format/local.rs similarity index 60% rename from src/format/local.rs rename to rc-zip/src/format/local.rs index 553b3e1..2c43c43 100644 --- a/src/format/local.rs +++ b/rc-zip/src/format/local.rs @@ -1,7 +1,8 @@ -use crate::format::*; +use crate::{format::*, Error, UnsupportedError}; use winnow::{ - binary::{le_u16, le_u32, le_u64}, + binary::{le_u16, le_u32, le_u64, le_u8}, combinator::opt, + error::{ContextError, ErrMode, ErrorKind, FromExternalError}, seq, token::tag, PResult, Parser, Partial, @@ -15,7 +16,7 @@ pub struct LocalFileHeaderRecord { /// general purpose bit flag pub flags: u16, /// compression method - pub method: u16, + pub method: Method, /// last mod file datetime pub modified: MsdosTimestamp, /// crc-32 @@ -28,6 +29,16 @@ pub struct LocalFileHeaderRecord { pub name: ZipString, // extra field pub extra: ZipBytes, + + // method-specific fields + pub method_specific: MethodSpecific, +} + +#[derive(Debug)] +/// Method-specific properties following 
the local file header +pub enum MethodSpecific { + None, + Lzma(LzmaProperties), } impl LocalFileHeaderRecord { @@ -38,7 +49,7 @@ impl LocalFileHeaderRecord { let reader_version = Version::parser.parse_next(i)?; let flags = le_u16.parse_next(i)?; - let method = le_u16.parse_next(i)?; + let method = le_u16.parse_next(i).map(Method::from)?; let modified = MsdosTimestamp::parser.parse_next(i)?; let crc32 = le_u32.parse_next(i)?; let compressed_size = le_u32.parse_next(i)?; @@ -50,6 +61,21 @@ impl LocalFileHeaderRecord { let name = ZipString::parser(name_len).parse_next(i)?; let extra = ZipBytes::parser(extra_len).parse_next(i)?; + let method_specific = match method { + Method::Lzma => { + let lzma_properties = LzmaProperties::parser.parse_next(i)?; + if let Err(e) = lzma_properties.error_if_unsupported() { + return Err(ErrMode::Cut(ContextError::from_external_error( + i, + ErrorKind::Verify, + e, + ))); + } + MethodSpecific::Lzma(lzma_properties) + } + _ => MethodSpecific::None, + }; + Ok(Self { reader_version, flags, @@ -60,6 +86,7 @@ impl LocalFileHeaderRecord { uncompressed_size, name, extra, + method_specific, }) } @@ -114,3 +141,48 @@ impl DataDescriptorRecord { } } } + +/// 5.8.5 LZMA Properties header +#[derive(Debug)] +pub struct LzmaProperties { + /// major version + pub major: u8, + /// minor version + pub minor: u8, + /// properties size + pub properties_size: u16, +} + +impl LzmaProperties { + pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { + seq! 
{Self { + major: le_u8, + minor: le_u8, + properties_size: le_u16, + }} + .parse_next(i) + } + + pub fn error_if_unsupported(&self) -> Result<(), Error> { + if (self.major, self.minor) != (2, 0) { + return Err(Error::Unsupported( + UnsupportedError::LzmaVersionUnsupported { + minor: self.minor, + major: self.major, + }, + )); + } + + const LZMA_PROPERTIES_SIZE: u16 = 5; + if self.properties_size != LZMA_PROPERTIES_SIZE { + return Err(Error::Unsupported( + UnsupportedError::LzmaPropertiesHeaderWrongSize { + expected: 5, + actual: self.properties_size, + }, + )); + } + + Ok(()) + } +} diff --git a/src/format/mod.rs b/rc-zip/src/format/mod.rs similarity index 54% rename from src/format/mod.rs rename to rc-zip/src/format/mod.rs index 709634f..541edc8 100644 --- a/src/format/mod.rs +++ b/rc-zip/src/format/mod.rs @@ -1,3 +1,10 @@ +//! Contain winnow parsers for most elements that make up a ZIP file, like +//! the end-of-central-directory record, local file headers, and central +//! directory headers. +//! +//! Everything in there is based off of the appnote, which you can find in the +//! source repository. 
+ pub use crate::encoding::Encoding; mod archive; diff --git a/src/format/mode.rs b/rc-zip/src/format/mode.rs similarity index 100% rename from src/format/mode.rs rename to rc-zip/src/format/mode.rs diff --git a/src/format/raw.rs b/rc-zip/src/format/raw.rs similarity index 100% rename from src/format/raw.rs rename to rc-zip/src/format/raw.rs diff --git a/src/format/version.rs b/rc-zip/src/format/version.rs similarity index 100% rename from src/format/version.rs rename to rc-zip/src/format/version.rs diff --git a/src/reader/archive_reader.rs b/rc-zip/src/fsm/archive.rs similarity index 61% rename from src/reader/archive_reader.rs rename to rc-zip/src/fsm/archive.rs index 0108142..6641d59 100644 --- a/src/reader/archive_reader.rs +++ b/rc-zip/src/fsm/archive.rs @@ -1,6 +1,13 @@ -use crate::{encoding::Encoding, error::*, format::*, reader::buffer::*, transition}; +use super::FsmResult; +use crate::{ + encoding::Encoding, + error::{Error, FormatError}, + parse::{ + Archive, DirectoryHeader, EndOfCentralDirectory, EndOfCentralDirectory64Locator, + EndOfCentralDirectory64Record, EndOfCentralDirectoryRecord, Located, StoredEntry, + }, +}; -use std::io::Read; use tracing::trace; use winnow::{ error::ErrMode, @@ -8,78 +15,70 @@ use winnow::{ Parser, Partial, }; -/// ArchiveReader parses a valid zip archive into an [Archive][]. In particular, this struct finds +/// [ArchiveFsm] parses a valid zip archive into an [Archive]. In particular, this struct finds /// an end of central directory record, parses the entire central directory, detects text encoding, /// and normalizes metadata. -pub struct ArchiveReader { - // Size of the entire zip file +/// +/// The loop is as follows: +/// +/// * Call [Self::wants_read] to check if more data is needed. +/// * If it returns `Some(offset)`, read the file at that offset +/// into [Self::space] and then call [Self::fill] with +/// the number of bytes read. +/// * Call [Self::process] to process the data. 
+/// * If it returns [FsmResult::Continue], loop back to the first step. +/// +/// Look at the integration tests or +/// [rc-zip-sync](https://crates.io/crates/rc-zip-sync) for concrete examples. +pub struct ArchiveFsm { + /// Size of the entire zip file size: u64, - state: ArchiveReaderState, -} -pub enum ArchiveReaderResult { - /// Indicates that [ArchiveReader][] has work left, and the loop should continue. - Continue, - /// Indicates that [ArchiveReader][] is done reading the central directory, - /// contains an [Archive][]. Calling any method after [process()](ArchiveReader::process()) has returned - /// `Done` will panic. - Done(Archive), -} + /// Current stage: finding the eocd, reading the eocd, reading the eocd64 + /// locator, reading the eocd64, or reading the central directory + state: State, -enum ArchiveReaderState { - /// Used while transitioning because ownership rules are tough. - Transitioning, + /// Buffer for reading data from the file + buffer: Buffer, +} +#[derive(Default)] +enum State { /// Finding and reading the end of central directory record - ReadEocd { buffer: Buffer, haystack_size: u64 }, + ReadEocd { + /// size of the haystack in which we're looking for the end of central + /// directory record. + /// this may be less than 65KiB if the file is smaller than that. + haystack_size: u64, + }, /// Reading the zip64 end of central directory record. ReadEocd64Locator { - buffer: Buffer, eocdr: Located, }, /// Reading the zip64 end of central directory record. ReadEocd64 { - buffer: Buffer, eocdr64_offset: u64, eocdr: Located, }, /// Reading all headers from the central directory ReadCentralDirectory { - buffer: Buffer, eocd: EndOfCentralDirectory, directory_headers: Vec, }, - /// Done! - Done, -} - -impl ArchiveReaderState { - fn buffer_as_mut(&mut self) -> Option<&mut Buffer> { - use ArchiveReaderState as S; - match self { - S::ReadEocd { ref mut buffer, .. } => Some(buffer), - S::ReadEocd64Locator { ref mut buffer, .. 
} => Some(buffer), - S::ReadEocd64 { ref mut buffer, .. } => Some(buffer), - S::ReadCentralDirectory { ref mut buffer, .. } => Some(buffer), - _ => None, - } - } + #[default] + Transitioning, } -impl ArchiveReader { +impl ArchiveFsm { /// This should be > 65KiB, because the section at the end of the /// file that we check for end of central directory record is 65KiB. const DEFAULT_BUFFER_SIZE: usize = 256 * 1024; /// Create a new archive reader with a specified file size. - /// - /// Actual reading of the file is performed by calling - /// [wants_read()](ArchiveReader::wants_read()), [read()](ArchiveReader::read()) and - /// [process()](ArchiveReader::process()) in a loop. pub fn new(size: u64) -> Self { let haystack_size: u64 = 65 * 1024; let haystack_size = if size < haystack_size { @@ -90,88 +89,54 @@ impl ArchiveReader { Self { size, - state: ArchiveReaderState::ReadEocd { - buffer: Buffer::with_capacity(Self::DEFAULT_BUFFER_SIZE), - haystack_size, - }, + buffer: Buffer::with_capacity(Self::DEFAULT_BUFFER_SIZE), + state: State::ReadEocd { haystack_size }, } } - /// Returns whether or not this reader needs more data to continue. - /// - /// Returns `Some(offset)` if this reader needs to read some data from `offset`. - /// In this case, [read()](ArchiveReader::read()) should be called with a [Read] - /// at the correct offset. - /// - /// Returns `None` if the reader does not need data and [process()](ArchiveReader::process()) - /// can be called directly. + /// If this returns `Some(offset)`, the caller should read data from + /// `offset` into [Self::space] — without forgetting to call + /// [Self::fill] with the number of bytes written. 
pub fn wants_read(&self) -> Option { - use ArchiveReaderState as S; + use State as S; match self.state { - S::ReadEocd { - ref buffer, - haystack_size, - } => Some(buffer.read_offset(self.size - haystack_size)), - S::ReadEocd64Locator { - ref buffer, - ref eocdr, - } => { + S::ReadEocd { haystack_size } => { + Some(self.buffer.read_offset(self.size - haystack_size)) + } + S::ReadEocd64Locator { ref eocdr } => { let length = EndOfCentralDirectory64Locator::LENGTH as u64; - Some(buffer.read_offset(eocdr.offset - length)) + Some(self.buffer.read_offset(eocdr.offset - length)) + } + S::ReadEocd64 { eocdr64_offset, .. } => Some(self.buffer.read_offset(eocdr64_offset)), + S::ReadCentralDirectory { ref eocd, .. } => { + Some(self.buffer.read_offset(eocd.directory_offset())) } - S::ReadEocd64 { - ref buffer, - eocdr64_offset, - .. - } => Some(buffer.read_offset(eocdr64_offset)), - S::ReadCentralDirectory { - ref buffer, - ref eocd, - .. - } => Some(buffer.read_offset(eocd.directory_offset())), - S::Done { .. } => panic!("Called wants_read() on ArchiveReader in Done state"), S::Transitioning => unreachable!(), } } - /// Reads some data from `rd` into the reader's internal buffer. - /// - /// Any I/O errors will be returned. - /// - /// If successful, this returns the number of bytes read. On success, - /// [process()](ArchiveReader::process()) should be called next. - pub fn read(&mut self, rd: &mut dyn Read) -> Result { - if let Some(buffer) = self.state.buffer_as_mut() { - buffer.read(rd) - } else { - Ok(0) - } - } - /// Process buffered data /// - /// Errors returned from process() are caused by invalid zip archives, + /// Errors returned from this function are caused by invalid zip archives, /// unsupported format quirks, or implementation bugs - never I/O errors. /// - /// A result of [ArchiveReaderResult::Continue] indicates one should loop again, - /// starting with [wants_read()](ArchiveReader::wants_read()). 
+ /// A result of [FsmResult::Continue] gives back ownership of the state + /// machine and indicates the I/O loop should continue, starting with + /// [Self::wants_read]. /// - /// A result of [ArchiveReaderResult::Done] contains the [Archive], and indicates that no - /// method should ever be called again on this reader. - pub fn process(&mut self) -> Result { - use ArchiveReaderResult as R; - use ArchiveReaderState as S; + /// A result of [FsmResult::Done] consumes the state machine and returns + /// a fully-parsed [Archive]. + pub fn process(mut self) -> Result, Error> { + use State as S; match self.state { - S::ReadEocd { - ref mut buffer, - haystack_size, - } => { - if buffer.read_bytes() < haystack_size { - return Ok(R::Continue); + S::ReadEocd { haystack_size } => { + if self.buffer.read_bytes() < haystack_size { + // read the entire haystack before we can continue + return Ok(FsmResult::Continue(self)); } match { - let haystack = &buffer.data()[..haystack_size as usize]; + let haystack = &self.buffer.data()[..haystack_size as usize]; EndOfCentralDirectoryRecord::find_in_block(haystack) } { None => Err(FormatError::DirectoryEndSignatureNotFound.into()), @@ -181,7 +146,7 @@ impl ArchiveReader { size = self.size, "ReadEocd | found end of central directory record" ); - buffer.reset(); + self.buffer.reset(); eocdr.offset += self.size - haystack_size; if eocdr.offset < EndOfCentralDirectory64Locator::LENGTH as u64 { @@ -191,70 +156,69 @@ impl ArchiveReader { eocd64locator_length = EndOfCentralDirectory64Locator::LENGTH, "no room for an EOCD64 locator, definitely not a zip64 file" ); - transition!(self.state => (S::ReadEocd { mut buffer, .. }) { - buffer.reset(); + transition!(self.state => (S::ReadEocd { .. 
}) { S::ReadCentralDirectory { - buffer, eocd: EndOfCentralDirectory::new(self.size, eocdr, None)?, directory_headers: vec![], } }); - Ok(R::Continue) + Ok(FsmResult::Continue(self)) } else { trace!("ReadEocd | transition to ReadEocd64Locator"); - transition!(self.state => (S::ReadEocd { mut buffer, .. }) { - buffer.reset(); - S::ReadEocd64Locator { buffer, eocdr } + self.buffer.reset(); + transition!(self.state => (S::ReadEocd { .. }) { + S::ReadEocd64Locator { eocdr } }); - Ok(R::Continue) + Ok(FsmResult::Continue(self)) } } } } - S::ReadEocd64Locator { ref mut buffer, .. } => { - let input = Partial::new(buffer.data()); + S::ReadEocd64Locator { .. } => { + let input = Partial::new(self.buffer.data()); match EndOfCentralDirectory64Locator::parser.parse_peek(input) { Err(ErrMode::Incomplete(_)) => { // need more data - Ok(R::Continue) + Ok(FsmResult::Continue(self)) } Err(ErrMode::Backtrack(_)) | Err(ErrMode::Cut(_)) => { // we don't have a zip64 end of central directory locator - that's ok! 
trace!("ReadEocd64Locator | no zip64 end of central directory locator"); - trace!("ReadEocd64Locator | data we got: {:02x?}", buffer.data()); - transition!(self.state => (S::ReadEocd64Locator { mut buffer, eocdr }) { - buffer.reset(); + trace!( + "ReadEocd64Locator | data we got: {:02x?}", + self.buffer.data() + ); + self.buffer.reset(); + transition!(self.state => (S::ReadEocd64Locator { eocdr }) { S::ReadCentralDirectory { - buffer, eocd: EndOfCentralDirectory::new(self.size, eocdr, None)?, directory_headers: vec![], } }); - Ok(R::Continue) + Ok(FsmResult::Continue(self)) } Ok((_, locator)) => { trace!( ?locator, "ReadEocd64Locator | found zip64 end of central directory locator" ); - transition!(self.state => (S::ReadEocd64Locator { mut buffer, eocdr }) { - buffer.reset(); + self.buffer.reset(); + transition!(self.state => (S::ReadEocd64Locator { eocdr }) { S::ReadEocd64 { - buffer, eocdr64_offset: locator.directory_offset, eocdr, } }); - Ok(R::Continue) + Ok(FsmResult::Continue(self)) } } } - S::ReadEocd64 { ref mut buffer, .. } => { - let input = Partial::new(buffer.data()); + S::ReadEocd64 { .. 
} => { + let input = Partial::new(self.buffer.data()); match EndOfCentralDirectory64Record::parser.parse_peek(input) { Err(ErrMode::Incomplete(_)) => { // need more data - Ok(R::Continue) + Ok(FsmResult::Continue(self)) } Err(ErrMode::Backtrack(_)) | Err(ErrMode::Cut(_)) => { // at this point, we really expected to have a zip64 end @@ -263,10 +227,9 @@ impl ArchiveReader { Err(FormatError::Directory64EndRecordInvalid.into()) } Ok((_, eocdr64)) => { - transition!(self.state => (S::ReadEocd64 { mut buffer, eocdr, eocdr64_offset }) { - buffer.reset(); + self.buffer.reset(); + transition!(self.state => (S::ReadEocd64 { eocdr, eocdr64_offset }) { S::ReadCentralDirectory { - buffer, eocd: EndOfCentralDirectory::new(self.size, eocdr, Some(Located { offset: eocdr64_offset, inner: eocdr64 @@ -274,22 +237,21 @@ impl ArchiveReader { directory_headers: vec![], } }); - Ok(R::Continue) + Ok(FsmResult::Continue(self)) } } } S::ReadCentralDirectory { - ref mut buffer, ref eocd, ref mut directory_headers, } => { trace!( "ReadCentralDirectory | process(), available: {}", - buffer.available_data() + self.buffer.available_data() ); - let mut input = Partial::new(buffer.data()); + let mut input = Partial::new(self.buffer.data()); trace!( - initial_offset = input.as_bytes().offset_from(&buffer.data()), + initial_offset = input.as_bytes().offset_from(&self.buffer.data()), initial_len = input.len(), "initial offset & len" ); @@ -298,7 +260,7 @@ impl ArchiveReader { Ok(dh) => { trace!( input_empty_now = input.is_empty(), - offset = input.as_bytes().offset_from(&buffer.data()), + offset = input.as_bytes().offset_from(&self.buffer.data()), len = input.len(), "ReadCentralDirectory | parsed directory header" ); @@ -385,8 +347,7 @@ impl ArchiveReader { comment = Some(encoding.decode(&eocd.comment().0)?); } - self.state = S::Done; - return Ok(R::Done(Archive { + return Ok(FsmResult::Done(Archive { size: self.size, comment, entries, @@ -404,15 +365,125 @@ impl ArchiveReader { } } } - let consumed 
= input.as_bytes().offset_from(&buffer.data()); + let consumed = input.as_bytes().offset_from(&self.buffer.data()); tracing::trace!(%consumed, "ReadCentralDirectory total consumed"); - buffer.consume(consumed); + self.buffer.consume(consumed); // need more data - Ok(R::Continue) + Ok(FsmResult::Continue(self)) } - S::Done { .. } => panic!("Called process() on ArchiveReader in Done state"), S::Transitioning => unreachable!(), } } + + /// Returns a mutable slice with all the available space to write to. + /// + /// After writing to this, call [Self::fill] with the number of bytes written. + #[inline] + pub fn space(&mut self) -> &mut [u8] { + if self.buffer.available_space() == 0 { + self.buffer.shift(); + } + self.buffer.space() + } + + /// After having written data to [Self::space], call this to indicate how + /// many bytes were written. + #[inline] + pub fn fill(&mut self, count: usize) -> usize { + self.buffer.fill(count) + } +} + +/// A wrapper around [oval::Buffer] that keeps track of how many bytes we've read since +/// initialization or the last reset. +pub(crate) struct Buffer { + pub(crate) buffer: oval::Buffer, + pub(crate) read_bytes: u64, +} + +impl Buffer { + /// creates a new buffer with the specified capacity + pub(crate) fn with_capacity(size: usize) -> Self { + Self { + buffer: oval::Buffer::with_capacity(size), + read_bytes: 0, + } + } + + /// resets the buffer (so that data() returns an empty slice, + /// and space() returns the full capacity), along with the + /// read bytes counter.
+ pub(crate) fn reset(&mut self) { + self.read_bytes = 0; + self.buffer.reset(); + } + + /// returns the number of read bytes since the last reset + #[inline] + pub(crate) fn read_bytes(&self) -> u64 { + self.read_bytes + } + + /// returns a slice with all the available data + #[inline] + pub(crate) fn data(&self) -> &[u8] { + self.buffer.data() + } + + /// returns how much data can be read from the buffer + #[inline] + pub(crate) fn available_data(&self) -> usize { + self.buffer.available_data() + } + + /// returns how much free space is available to write to + #[inline] + pub fn available_space(&self) -> usize { + self.buffer.available_space() + } + + /// returns a mutable slice with all the available space to + /// write to + #[inline] + pub(crate) fn space(&mut self) -> &mut [u8] { + self.buffer.space() + } + + /// moves the data at the beginning of the buffer + /// + /// if the position was more than 0, it is now 0 + #[inline] + pub fn shift(&mut self) { + self.buffer.shift() + } + + /// after having written data to the buffer, use this function + /// to indicate how many bytes were written + /// + /// if there is not enough available space, this function can call + /// `shift()` to move the remaining data to the beginning of the + /// buffer + #[inline] + pub(crate) fn fill(&mut self, count: usize) -> usize { + let n = self.buffer.fill(count); + self.read_bytes += n as u64; + n + } + + /// advances the position tracker + /// + /// if the position gets past the buffer's half, + /// this will call `shift()` to move the remaining data + /// to the beginning of the buffer + #[inline] + pub(crate) fn consume(&mut self, size: usize) { + self.buffer.consume(size); + } + + /// computes an absolute offset, given an offset relative + /// to the current read position + pub(crate) fn read_offset(&self, offset: u64) -> u64 { + self.read_bytes + offset + } } diff --git a/rc-zip/src/fsm/entry/bzip2_dec.rs b/rc-zip/src/fsm/entry/bzip2_dec.rs new file mode 100644 index 
0000000..6abad7c --- /dev/null +++ b/rc-zip/src/fsm/entry/bzip2_dec.rs @@ -0,0 +1,67 @@ +use crate::{error::Error, parse::Method}; + +use super::{DecompressOutcome, Decompressor, HasMoreInput}; + +pub(crate) struct Bzip2Dec { + inner: bzip2::Decompress, + eof: bool, +} + +impl Default for Bzip2Dec { + fn default() -> Self { + // don't use the 'small' alternative decompression algorithm + let small = false; + Self { + inner: bzip2::Decompress::new(small), + eof: false, + } + } +} + +impl Decompressor for Bzip2Dec { + fn decompress( + &mut self, + in_buf: &[u8], + out: &mut [u8], + _has_more_input: HasMoreInput, + ) -> Result { + tracing::trace!( + in_buf_len = in_buf.len(), + out_len = out.len(), + total_in = self.inner.total_in(), + total_out = self.inner.total_out(), + "Bzip2Dec::decompress", + ); + + if self.eof { + return Ok(DecompressOutcome { + bytes_written: 0, + bytes_read: 0, + }); + } + + let before_in = self.inner.total_in(); + let before_out = self.inner.total_out(); + + match self.inner.decompress(in_buf, out) { + Ok(status) => { + tracing::trace!("status: {:?}", status); + if status == bzip2::Status::StreamEnd { + self.eof = true; + } + } + Err(e) => { + return Err(Error::Decompression { + method: Method::Bzip2, + msg: e.to_string(), + }) + } + }; + + let outcome = DecompressOutcome { + bytes_written: (self.inner.total_out() - before_out) as usize, + bytes_read: (self.inner.total_in() - before_in) as usize, + }; + Ok(outcome) + } +} diff --git a/rc-zip/src/fsm/entry/deflate64_dec.rs b/rc-zip/src/fsm/entry/deflate64_dec.rs new file mode 100644 index 0000000..0a2cfec --- /dev/null +++ b/rc-zip/src/fsm/entry/deflate64_dec.rs @@ -0,0 +1,47 @@ +use deflate64::InflaterManaged; + +use crate::{error::Error, parse::Method}; + +use super::{DecompressOutcome, Decompressor, HasMoreInput}; + +pub(crate) struct Deflate64Dec { + inflater: InflaterManaged, +} + +impl Default for Deflate64Dec { + fn default() -> Self { + Self { + inflater: InflaterManaged::new(), + } + 
} +} + +impl Decompressor for Deflate64Dec { + fn decompress( + &mut self, + in_buf: &[u8], + out: &mut [u8], + _has_more_input: HasMoreInput, + ) -> Result { + tracing::trace!( + in_buf_len = in_buf.len(), + out_len = out.len(), + remain_in_internal_buffer = self.inflater.available_output(), + "decompress", + ); + + let res = self.inflater.inflate(in_buf, out); + if res.data_error { + return Err(Error::Decompression { + method: Method::Deflate64, + msg: "data error".into(), + }); + } + + let outcome = DecompressOutcome { + bytes_read: res.bytes_consumed, + bytes_written: res.bytes_written, + }; + Ok(outcome) + } +} diff --git a/rc-zip/src/fsm/entry/deflate_dec.rs b/rc-zip/src/fsm/entry/deflate_dec.rs new file mode 100644 index 0000000..db405b6 --- /dev/null +++ b/rc-zip/src/fsm/entry/deflate_dec.rs @@ -0,0 +1,149 @@ +use std::cmp; + +use miniz_oxide::inflate::{ + core::{ + decompress, + inflate_flags::{TINFL_FLAG_HAS_MORE_INPUT, TINFL_FLAG_IGNORE_ADLER32}, + DecompressorOxide, + }, + TINFLStatus, +}; +use tracing::trace; + +use crate::{error::Error, fsm::entry::HasMoreInput, parse::Method}; + +use super::{DecompressOutcome, Decompressor}; + +pub(crate) struct DeflateDec { + /// 64 KiB circular internal buffer. From miniz_oxide docs: + /// + /// > The decompression function normally needs access to 32KiB of the + /// > previously decompressed data (or to the beginning of the decompressed + /// > data if less than 32KiB has been decompressed.) + internal_buffer: Vec, + + /// The position in the internal buffer where we should start writing the + /// next decompressed data. Note that the buffer is circular, so we need to + /// wrap around when we reach the end. + out_pos: usize, + + /// If this is non-zero, there's data *after* [Self::out_pos] we haven't + /// copied to the caller's output buffer yet. As we copy it, we'll decrease + /// this value and increase [Self::out_pos]. When it reaches zero, we'll + /// need to call miniz_oxide again to get more data. 
+ remain_in_internal_buffer: usize, + + /// The miniz_oxide decompressor state + state: DecompressorOxide, +} + +impl Default for DeflateDec { + fn default() -> Self { + Self { + internal_buffer: vec![0u8; Self::INTERNAL_BUFFER_LENGTH], + out_pos: 0, + state: DecompressorOxide::new(), + remain_in_internal_buffer: 0, + } + } +} + +impl Decompressor for DeflateDec { + fn decompress( + &mut self, + in_buf: &[u8], + out: &mut [u8], + has_more_input: HasMoreInput, + ) -> Result { + tracing::trace!( + in_buf_len = in_buf.len(), + out_len = out.len(), + remain_in_internal_buffer = self.remain_in_internal_buffer, + out_pos = self.out_pos, + "decompress", + ); + + let mut outcome: DecompressOutcome = Default::default(); + self.copy_to_out(out, &mut outcome); + if outcome.bytes_written > 0 { + tracing::trace!( + "returning {} bytes from internal buffer", + outcome.bytes_written + ); + return Ok(outcome); + } + + // no output bytes, let's call miniz_oxide + + let mut flags = TINFL_FLAG_IGNORE_ADLER32; + if matches!(has_more_input, HasMoreInput::Yes) { + flags |= TINFL_FLAG_HAS_MORE_INPUT; + } + + let (status, bytes_read, bytes_written) = decompress( + &mut self.state, + in_buf, + &mut self.internal_buffer, + self.out_pos, + flags, + ); + outcome.bytes_read += bytes_read; + self.remain_in_internal_buffer += bytes_written; + + match status { + TINFLStatus::FailedCannotMakeProgress => { + return Err(Error::Decompression { method: Method::Deflate, msg: "Failed to make progress: more input data was expected, but the caller indicated there was no more data, so the input stream is likely truncated".to_string() }) + } + TINFLStatus::BadParam => { + return Err(Error::Decompression { method: Method::Deflate, msg: "The output buffer is an invalid size; consider the flags parameter".to_string() }) + } + TINFLStatus::Adler32Mismatch => { + return Err(Error::Decompression { method: Method::Deflate, msg: "The decompression went fine, but the adler32 checksum did not match the one provided 
in the header.".to_string() }) + } + TINFLStatus::Failed => { + return Err(Error::Decompression { method: Method::Deflate, msg: "Failed to decompress due to invalid data.".to_string() }) + }, + TINFLStatus::Done => { + // eventually this'll return bytes_written == 0 + }, + TINFLStatus::NeedsMoreInput => { + // that's okay, we'll get more input next time + }, + TINFLStatus::HasMoreOutput => { + // that's okay, as long as we return bytes_written > 0 + // the caller will keep calling + }, + } + + self.copy_to_out(out, &mut outcome); + Ok(outcome) + } +} + +impl DeflateDec { + const INTERNAL_BUFFER_LENGTH: usize = 64 * 1024; + + fn copy_to_out(&mut self, mut out: &mut [u8], outcome: &mut DecompressOutcome) { + // as long as there's room in out_buf and we have remaining data in the + // internal buffer, copy from internal_buffer wrapping as needed, + // decreasing self.remain_in_internal_buffer and increasing self.out_pos + // and outcome.bytes_written + while !out.is_empty() && self.remain_in_internal_buffer > 0 { + let copy_len = cmp::min(self.remain_in_internal_buffer, out.len()); + // take wrapping into account + let copy_len = cmp::min(copy_len, self.internal_buffer.len() - self.out_pos); + trace!("copying {} bytes from internal buffer to out_buf", copy_len); + + out[..copy_len].copy_from_slice(&self.internal_buffer[self.out_pos..][..copy_len]); + self.out_pos += copy_len; + outcome.bytes_written += copy_len; + self.remain_in_internal_buffer -= copy_len; + out = &mut out[copy_len..]; + + // if we've reached the end of the buffer, wrap around + if self.out_pos == self.internal_buffer.len() { + self.out_pos = 0; + } + } + } +} diff --git a/rc-zip/src/fsm/entry/lzma_dec.rs b/rc-zip/src/fsm/entry/lzma_dec.rs new file mode 100644 index 0000000..0b98890 --- /dev/null +++ b/rc-zip/src/fsm/entry/lzma_dec.rs @@ -0,0 +1,149 @@ +use std::{cmp, io::Write}; + +use crate::{error::Error, parse::Method}; + +use super::{DecompressOutcome, Decompressor, HasMoreInput}; + +use 
lzma_rs::decompress::{Options, Stream, UnpackedSize}; +use tracing::trace; + +#[derive(Default)] +enum State { + Writing(Box>>), + Draining(Vec), + + #[default] + Transition, +} + +pub(crate) struct LzmaDec { + state: State, +} + +impl LzmaDec { + pub fn new(uncompressed_size: u64) -> Self { + let stream = Stream::new_with_options( + &(Options { + unpacked_size: UnpackedSize::UseProvided(Some(uncompressed_size)), + allow_incomplete: false, + memlimit: Some(128 * 1024 * 1024), + }), + vec![], + ); + + Self { + state: State::Writing(Box::new(stream)), + } + } +} + +impl Decompressor for LzmaDec { + fn decompress( + &mut self, + in_buf: &[u8], + out: &mut [u8], + has_more_input: HasMoreInput, + ) -> Result { + tracing::trace!( + in_buf_len = in_buf.len(), + out_len = out.len(), + remain_in_internal_buffer = self.internal_buf_mut().len(), + "decompress", + ); + + let mut outcome: DecompressOutcome = Default::default(); + + self.copy_to_out(out, &mut outcome); + if outcome.bytes_written > 0 { + trace!( + "still draining internal buffer, just copied {} bytes", + outcome.bytes_written + ); + return Ok(outcome); + } + + match &mut self.state { + State::Writing(stream) => { + let n = stream.write(in_buf).map_err(dec_err)?; + trace!( + "wrote {} bytes to decompressor (of {} available)", + n, + in_buf.len() + ); + outcome.bytes_read = n; + + // if we haven't written all the input, and we haven't gotten + // any output, then we need to keep going + if n != 0 && n < in_buf.len() && self.internal_buf_mut().is_empty() { + // note: the n != 0 here is because apparently there can be a 10-byte + // trailer after LZMA compressed data? and the decoder will _refuse_ + // to let us write them, so when we have just these 10 bytes left, + // it's good to just let the decoder finish up. 
+ trace!("didn't write all output AND no output yet, so keep going"); + return self.decompress(&in_buf[n..], out, has_more_input); + } + + match has_more_input { + HasMoreInput::Yes => { + // keep going + trace!("more input to come"); + } + HasMoreInput::No => { + trace!("no more input to come"); + match std::mem::take(&mut self.state) { + State::Writing(stream) => { + trace!("finishing..."); + self.state = State::Draining(stream.finish().map_err(dec_err)?); + } + _ => unreachable!(), + } + } + } + } + State::Draining(_) => { + // keep going + } + State::Transition => unreachable!(), + } + + self.copy_to_out(out, &mut outcome); + trace!("decompressor gave us {} bytes", outcome.bytes_written); + Ok(outcome) + } +} + +fn dec_err(e: impl std::fmt::Display) -> Error { + Error::Decompression { + method: Method::Lzma, + msg: e.to_string(), + } +} + +impl LzmaDec { + #[inline(always)] + fn internal_buf_mut(&mut self) -> &mut Vec { + match &mut self.state { + State::Writing(stream) => stream.get_output_mut().unwrap(), + State::Draining(buf) => buf, + State::Transition => unreachable!(), + } + } + + fn copy_to_out(&mut self, mut out: &mut [u8], outcome: &mut DecompressOutcome) { + let internal_buf = self.internal_buf_mut(); + + while !out.is_empty() && !internal_buf.is_empty() { + let to_copy = cmp::min(out.len(), internal_buf.len()); + trace!("copying {} bytes from internal buffer", to_copy); + out[..to_copy].copy_from_slice(&internal_buf[..to_copy]); + out = &mut out[to_copy..]; + + // rotate the internal buffer + internal_buf.rotate_left(to_copy); + // and shrink it + internal_buf.resize(internal_buf.len() - to_copy, 0); + + outcome.bytes_written += to_copy; + } + } +} diff --git a/rc-zip/src/fsm/entry/mod.rs b/rc-zip/src/fsm/entry/mod.rs new file mode 100644 index 0000000..db72965 --- /dev/null +++ b/rc-zip/src/fsm/entry/mod.rs @@ -0,0 +1,433 @@ +use std::cmp; + +use oval::Buffer; +use tracing::trace; +use winnow::{ + error::ErrMode, + stream::{AsBytes, Offset}, + 
Parser, Partial, +}; + +mod store_dec; + +#[cfg(feature = "deflate")] +mod deflate_dec; + +#[cfg(feature = "deflate64")] +mod deflate64_dec; + +#[cfg(feature = "bzip2")] +mod bzip2_dec; + +#[cfg(feature = "lzma")] +mod lzma_dec; + +#[cfg(feature = "zstd")] +mod zstd_dec; + +use crate::{ + error::{Error, FormatError, UnsupportedError}, + parse::{DataDescriptorRecord, LocalFileHeaderRecord, Method, StoredEntryInner}, +}; + +use super::FsmResult; + +struct EntryReadMetrics { + uncompressed_size: u64, + crc32: u32, +} + +#[derive(Default)] +enum State { + ReadLocalHeader, + + ReadData { + /// The local file header for this entry + header: LocalFileHeaderRecord, + + /// Amount of bytes we've fed to the decompressor + compressed_bytes: u64, + + /// Amount of bytes the decompressor has produced + uncompressed_bytes: u64, + + /// CRC32 hash of the decompressed data + hasher: crc32fast::Hasher, + + /// The decompression method we're using + decompressor: AnyDecompressor, + }, + + ReadDataDescriptor { + /// The local file header for this entry + header: LocalFileHeaderRecord, + + /// Size we've decompressed + crc32 hash we've computed + metrics: EntryReadMetrics, + }, + + Validate { + /// The local file header for this entry + header: LocalFileHeaderRecord, + + /// Size we've decompressed + crc32 hash we've computed + metrics: EntryReadMetrics, + + /// The data descriptor for this entry, if any + descriptor: Option, + }, + + #[default] + Transition, +} + +/// A state machine that can parse a zip entry +pub struct EntryFsm { + state: State, + entry: StoredEntryInner, + method: Method, + buffer: Buffer, + eof: bool, +} + +impl EntryFsm { + /// Create a new state machine for decompressing a zip entry + pub fn new(method: Method, entry: StoredEntryInner) -> Self { + Self { + state: State::ReadLocalHeader, + entry, + method, + buffer: Buffer::with_capacity(256 * 1024), + eof: false, + } + } + + /// If this returns true, the caller should read data into + /// [Self::space] —
without forgetting to call [Self::fill] with the number + /// of bytes written. + pub fn wants_read(&self) -> bool { + match self.state { + State::ReadLocalHeader => true, + State::ReadData { .. } => { + // we want to read if we have space + self.buffer.available_space() > 0 + } + State::ReadDataDescriptor { .. } => true, + State::Validate { .. } => false, + State::Transition => unreachable!(), + } + } + + /// Process the input and write the output to the given buffer + /// + /// This function will return `FsmResult::Continue` if it needs more input + /// to continue, or if it needs more space to write to. It will return + /// `FsmResult::Done` when all the input has been decompressed and all + /// the output has been written. + /// + /// Also, after writing all the output, process will read the data + /// descriptor (if any), and make sure the CRC32 hash and the uncompressed + /// size match the expected values. + pub fn process( + mut self, + out: &mut [u8], + ) -> Result, Error> { + tracing::trace!( + state = match &self.state { + State::ReadLocalHeader => "ReadLocalHeader", + State::ReadData { .. } => "ReadData", + State::ReadDataDescriptor { .. } => "ReadDataDescriptor", + State::Validate { .. 
} => "Validate", + State::Transition => "Transition", + }, + "process" + ); + + use State as S; + match &mut self.state { + S::ReadLocalHeader => { + let mut input = Partial::new(self.buffer.data()); + match LocalFileHeaderRecord::parser.parse_next(&mut input) { + Ok(header) => { + let consumed = input.as_bytes().offset_from(&self.buffer.data()); + tracing::trace!(local_file_header = ?header, consumed, "parsed local file header"); + self.buffer.consume(consumed); + self.state = S::ReadData { + header, + compressed_bytes: 0, + uncompressed_bytes: 0, + hasher: crc32fast::Hasher::new(), + decompressor: AnyDecompressor::new(self.method, &self.entry)?, + }; + self.process(out) + } + Err(ErrMode::Incomplete(_)) => { + Ok(FsmResult::Continue((self, Default::default()))) + } + Err(_e) => Err(Error::Format(FormatError::InvalidLocalHeader)), + } + } + S::ReadData { + compressed_bytes, + uncompressed_bytes, + hasher, + decompressor, + .. + } => { + let in_buf = self.buffer.data(); + + // don't feed the decompressor bytes beyond the entry's compressed size + let in_buf_max_len = cmp::min( + in_buf.len(), + self.entry.compressed_size as usize - *compressed_bytes as usize, + ); + let in_buf = &in_buf[..in_buf_max_len]; + + let fed_bytes_after_this = *compressed_bytes + in_buf.len() as u64; + + let has_more_input = if fed_bytes_after_this == self.entry.compressed_size as _ { + HasMoreInput::No + } else { + HasMoreInput::Yes + }; + let outcome = decompressor.decompress(in_buf, out, has_more_input)?; + trace!( + ?outcome, + compressed_bytes = *compressed_bytes, + uncompressed_bytes = *uncompressed_bytes, + eof = self.eof, + "decompressed" + ); + self.buffer.consume(outcome.bytes_read); + *compressed_bytes += outcome.bytes_read as u64; + + if outcome.bytes_written == 0 && self.eof { + // we're done, let's read the data descriptor (if there's one) + transition!(self.state => (S::ReadData { header, uncompressed_bytes, hasher, .. 
}) { + let metrics = EntryReadMetrics { + uncompressed_size: uncompressed_bytes, + crc32: hasher.finalize(), + }; + + if header.has_data_descriptor() { + S::ReadDataDescriptor { header, metrics } + } else { + S::Validate { header, metrics, descriptor: None } + } + }); + return self.process(out); + } + + // write the decompressed data to the hasher + hasher.update(&out[..outcome.bytes_written]); + // update the number of bytes we've decompressed + *uncompressed_bytes += outcome.bytes_written as u64; + + trace!( + compressed_bytes = *compressed_bytes, + uncompressed_bytes = *uncompressed_bytes, + "updated hasher" + ); + + Ok(FsmResult::Continue((self, outcome))) + } + S::ReadDataDescriptor { .. } => { + let mut input = Partial::new(self.buffer.data()); + match DataDescriptorRecord::mk_parser(self.entry.is_zip64).parse_next(&mut input) { + Ok(descriptor) => { + self.buffer + .consume(input.as_bytes().offset_from(&self.buffer.data())); + trace!("data descriptor = {:#?}", descriptor); + transition!(self.state => (S::ReadDataDescriptor { metrics, header, .. 
}) { + S::Validate { metrics, header, descriptor: Some(descriptor) } + }); + self.process(out) + } + Err(ErrMode::Incomplete(_)) => { + Ok(FsmResult::Continue((self, Default::default()))) + } + Err(_e) => Err(Error::Format(FormatError::InvalidDataDescriptor)), + } + } + S::Validate { + header, + metrics, + descriptor, + } => { + let expected_crc32 = if self.entry.crc32 != 0 { + self.entry.crc32 + } else if let Some(descriptor) = descriptor.as_ref() { + descriptor.crc32 + } else { + header.crc32 + }; + + let expected_size = if self.entry.uncompressed_size != 0 { + self.entry.uncompressed_size + } else if let Some(descriptor) = descriptor.as_ref() { + descriptor.uncompressed_size + } else { + header.uncompressed_size as u64 + }; + + if expected_size != metrics.uncompressed_size { + return Err(Error::Format(FormatError::WrongSize { + expected: expected_size, + actual: metrics.uncompressed_size, + })); + } + + if expected_crc32 != 0 && expected_crc32 != metrics.crc32 { + return Err(Error::Format(FormatError::WrongChecksum { + expected: expected_crc32, + actual: metrics.crc32, + })); + } + + Ok(FsmResult::Done(())) + } + S::Transition => { + unreachable!("the state machine should never be in the transition state") + } + } + } + + /// Returns a mutable slice with all the available space to write to. + /// + /// After writing to this, call [Self::fill] with the number of bytes written. + #[inline] + pub fn space(&mut self) -> &mut [u8] { + if self.buffer.available_space() == 0 { + self.buffer.shift(); + } + self.buffer.space() + } + + /// After having written data to [Self::space], call this to indicate how + /// many bytes were written. 
+ /// + /// If this is called with zero, it indicates eof + #[inline] + pub fn fill(&mut self, count: usize) -> usize { + if count == 0 { + self.eof = true; + } + self.buffer.fill(count) + } +} + +enum AnyDecompressor { + Store(store_dec::StoreDec), + #[cfg(feature = "deflate")] + Deflate(Box), + #[cfg(feature = "deflate64")] + Deflate64(Box), + #[cfg(feature = "bzip2")] + Bzip2(bzip2_dec::Bzip2Dec), + #[cfg(feature = "lzma")] + Lzma(Box), + #[cfg(feature = "zstd")] + Zstd(zstd_dec::ZstdDec), +} + +#[derive(Default, Debug)] +pub struct DecompressOutcome { + /// Number of bytes read from input + pub bytes_read: usize, + + /// Number of bytes written to output + pub bytes_written: usize, +} + +pub enum HasMoreInput { + Yes, + No, +} + +trait Decompressor { + fn decompress( + &mut self, + in_buf: &[u8], + out: &mut [u8], + has_more_input: HasMoreInput, + ) -> Result; +} + +impl AnyDecompressor { + fn new(method: Method, #[allow(unused)] entry: &StoredEntryInner) -> Result { + let dec = match method { + Method::Store => Self::Store(Default::default()), + + #[cfg(feature = "deflate")] + Method::Deflate => Self::Deflate(Default::default()), + #[cfg(not(feature = "deflate"))] + Method::Deflate => { + let err = Error::Unsupported(UnsupportedError::MethodNotEnabled(method)); + return Err(err); + } + + #[cfg(feature = "deflate64")] + Method::Deflate64 => Self::Deflate64(Default::default()), + #[cfg(not(feature = "deflate64"))] + Method::Deflate64 => { + let err = Error::Unsupported(UnsupportedError::MethodNotEnabled(method)); + return Err(err); + } + + #[cfg(feature = "bzip2")] + Method::Bzip2 => Self::Bzip2(Default::default()), + #[cfg(not(feature = "bzip2"))] + Method::Bzip2 => { + let err = Error::Unsupported(UnsupportedError::MethodNotEnabled(method)); + return Err(err); + } + + #[cfg(feature = "lzma")] + Method::Lzma => Self::Lzma(Box::new(lzma_dec::LzmaDec::new(entry.uncompressed_size))), + #[cfg(not(feature = "lzma"))] + Method::Lzma => { + let err = 
Error::Unsupported(UnsupportedError::MethodNotEnabled(method)); + return Err(err); + } + + #[cfg(feature = "zstd")] + Method::Zstd => Self::Zstd(zstd_dec::ZstdDec::new()?), + #[cfg(not(feature = "zstd"))] + Method::Zstd => { + let err = Error::Unsupported(UnsupportedError::MethodNotEnabled(method)); + return Err(err); + } + + _ => { + let err = Error::Unsupported(UnsupportedError::MethodNotSupported(method)); + return Err(err); + } + }; + Ok(dec) + } +} + +impl Decompressor for AnyDecompressor { + #[inline] + fn decompress( + &mut self, + in_buf: &[u8], + out: &mut [u8], + has_more_input: HasMoreInput, + ) -> Result { + // forward to the appropriate decompressor + match self { + Self::Store(dec) => dec.decompress(in_buf, out, has_more_input), + #[cfg(feature = "deflate")] + Self::Deflate(dec) => dec.decompress(in_buf, out, has_more_input), + #[cfg(feature = "deflate64")] + Self::Deflate64(dec) => dec.decompress(in_buf, out, has_more_input), + #[cfg(feature = "bzip2")] + Self::Bzip2(dec) => dec.decompress(in_buf, out, has_more_input), + #[cfg(feature = "lzma")] + Self::Lzma(dec) => dec.decompress(in_buf, out, has_more_input), + #[cfg(feature = "zstd")] + Self::Zstd(dec) => dec.decompress(in_buf, out, has_more_input), + } + } +} diff --git a/rc-zip/src/fsm/entry/store_dec.rs b/rc-zip/src/fsm/entry/store_dec.rs new file mode 100644 index 0000000..784f23d --- /dev/null +++ b/rc-zip/src/fsm/entry/store_dec.rs @@ -0,0 +1,24 @@ +use std::cmp; + +use crate::error::Error; + +use super::{DecompressOutcome, Decompressor, HasMoreInput}; + +#[derive(Default)] +pub(crate) struct StoreDec; + +impl Decompressor for StoreDec { + fn decompress( + &mut self, + in_buf: &[u8], + out_buf: &mut [u8], + _has_more_input: HasMoreInput, + ) -> Result { + let len = cmp::min(in_buf.len(), out_buf.len()); + out_buf[..len].copy_from_slice(&in_buf[..len]); + Ok(DecompressOutcome { + bytes_read: len, + bytes_written: len, + }) + } +} diff --git a/rc-zip/src/fsm/entry/zstd_dec.rs 
b/rc-zip/src/fsm/entry/zstd_dec.rs new file mode 100644 index 0000000..276fefc --- /dev/null +++ b/rc-zip/src/fsm/entry/zstd_dec.rs @@ -0,0 +1,141 @@ +use std::{cmp, io::Write}; + +use crate::{error::Error, parse::Method}; + +use super::{DecompressOutcome, Decompressor, HasMoreInput}; + +use tracing::trace; +use zstd::stream::write::Decoder; + +#[derive(Default)] +enum State { + Writing(Box>>), + Draining(Vec), + + #[default] + Transition, +} + +pub(crate) struct ZstdDec { + state: State, +} + +impl ZstdDec { + pub fn new() -> Result { + Ok(Self { + state: State::Writing(Box::new(Decoder::new(vec![])?)), + }) + } +} + +impl Decompressor for ZstdDec { + fn decompress( + &mut self, + in_buf: &[u8], + out: &mut [u8], + has_more_input: HasMoreInput, + ) -> Result { + tracing::trace!( + in_buf_len = in_buf.len(), + out_len = out.len(), + remain_in_internal_buffer = self.internal_buf_mut().len(), + "decompress", + ); + + let mut outcome: DecompressOutcome = Default::default(); + + self.copy_to_out(out, &mut outcome); + if outcome.bytes_written > 0 { + trace!( + "still draining internal buffer, just copied {} bytes", + outcome.bytes_written + ); + return Ok(outcome); + } + + match &mut self.state { + State::Writing(stream) => { + let n = stream.write(in_buf).map_err(dec_err)?; + trace!( + "wrote {} bytes to decompressor (of {} available)", + n, + in_buf.len() + ); + outcome.bytes_read = n; + + // if we haven't written all the input, and we haven't gotten + // any output, then we need to keep going + if n != 0 && n < in_buf.len() && self.internal_buf_mut().is_empty() { + // note: the n != 0 here is because apparently there can be a 10-byte + // trailer after LZMA compressed data? and the decoder will _refuse_ + // to let us write them, so when we have just these 10 bytes left, + // it's good to just let the decoder finish up. 
+ trace!("didn't write all output AND no output yet, so keep going"); + return self.decompress(&in_buf[n..], out, has_more_input).map(|mut rest| { rest.bytes_read += n; rest }); + } + + match has_more_input { + HasMoreInput::Yes => { + // keep going + trace!("more input to come"); + } + HasMoreInput::No => { + trace!("no more input to come"); + match std::mem::take(&mut self.state) { + State::Writing(mut stream) => { + trace!("finishing..."); + stream.flush().map_err(dec_err)?; + self.state = State::Draining(stream.into_inner()); + } + _ => unreachable!(), + } + } + } + } + State::Draining(_) => { + // keep going + } + State::Transition => unreachable!(), + } + + self.copy_to_out(out, &mut outcome); + trace!("decompressor gave us {} bytes", outcome.bytes_written); + Ok(outcome) + } +} + +fn dec_err(e: impl std::fmt::Display) -> Error { + Error::Decompression { + method: Method::Zstd, + msg: e.to_string(), + } +} + +impl ZstdDec { + #[inline(always)] + fn internal_buf_mut(&mut self) -> &mut Vec { + match &mut self.state { + State::Writing(stream) => stream.get_mut(), + State::Draining(buf) => buf, + State::Transition => unreachable!(), + } + } + + fn copy_to_out(&mut self, mut out: &mut [u8], outcome: &mut DecompressOutcome) { + let internal_buf = self.internal_buf_mut(); + + while !out.is_empty() && !internal_buf.is_empty() { + let to_copy = cmp::min(out.len(), internal_buf.len()); + trace!("copying {} bytes from internal buffer", to_copy); + out[..to_copy].copy_from_slice(&internal_buf[..to_copy]); + out = &mut out[to_copy..]; + + // rotate the internal buffer + internal_buf.rotate_left(to_copy); + // and shrink it + internal_buf.resize(internal_buf.len() - to_copy, 0); + + outcome.bytes_written += to_copy; + } + } +} diff --git a/rc-zip/src/fsm/mod.rs b/rc-zip/src/fsm/mod.rs new file mode 100644 index 0000000..d43c324 --- /dev/null +++ b/rc-zip/src/fsm/mod.rs @@ -0,0 +1,35 @@ +//! State machines built atop parsers, ready to bring your own I/O with. +//! +//! 
Parsers are just part of the puzzle when it comes to zip files: finding the +//! central directory is non-trivial and involves seeking around the input: +//! [ArchiveFsm] provides a state machine to handle this. +//! +//! Similarly, reading an entry involves reading the local header, then the +//! data (while calculating the CRC32), then the data descriptor, and then +//! checking whether the uncompressed size and CRC32 match the values in the +//! central directory. + +macro_rules! transition { + ($state: expr => ($pattern: pat) $body: expr) => { + $state = if let $pattern = std::mem::take(&mut $state) { + $body + } else { + unreachable!() + }; + }; +} + +mod archive; +pub use archive::ArchiveFsm; + +mod entry; +pub use entry::EntryFsm; + +/// Indicates whether or not the state machine has completed its work +pub enum FsmResult { + /// The I/O loop needs to continue, the state machine is given back. + Continue(M), + + /// The state machine is done, and the result is returned. + Done(R), +} diff --git a/rc-zip/src/lib.rs b/rc-zip/src/lib.rs new file mode 100644 index 0000000..50fda8c --- /dev/null +++ b/rc-zip/src/lib.rs @@ -0,0 +1,21 @@ +#![warn(missing_docs)] + +//! rc-zip is a [sans-io](https://sans-io.readthedocs.io/how-to-sans-io.html) library for reading zip files. +//! +//! It's made up of a bunch of types representing the various parts of a zip +//! file, winnow parsers that can turn byte buffers into those types, and +//! state machines that can use those parsers to read zip files from a stream. +//! +//! This crate is low-level, you may be interested in either of those higher +//! level wrappers: +//! +//! * [rc-zip-sync](https://crates.io/crates/rc-zip-sync) for using std I/O traits +//! 
* [rc-zip-tokio](https://crates.io/crates/rc-zip-tokio) for using tokio I/O traits + +pub mod encoding; +pub mod error; +pub mod fsm; +pub mod parse; + +#[cfg(any(test, feature = "corpus"))] +pub mod corpus; diff --git a/rc-zip/src/parse/archive.rs b/rc-zip/src/parse/archive.rs new file mode 100644 index 0000000..4b464eb --- /dev/null +++ b/rc-zip/src/parse/archive.rs @@ -0,0 +1,344 @@ +use chrono::{DateTime, Utc}; +use num_enum::{FromPrimitive, IntoPrimitive}; + +use crate::{ + encoding::Encoding, + parse::{ExtraField, Mode, Version}, +}; + +/// An Archive contains general information about a zip file, along with a list +/// of [entries][StoredEntry]. +/// +/// It is obtained through a state machine like +/// [ArchiveFsm](crate::fsm::ArchiveFsm), although end-users tend to use +/// higher-level interfaces like +/// [rc-zip-sync](https://crates.io/crates/rc-zip-sync) or +/// [rc-zip-tokio](https://crates.io/crates/rc-zip-tokio). +pub struct Archive { + pub(crate) size: u64, + pub(crate) encoding: Encoding, + pub(crate) entries: Vec, + pub(crate) comment: Option, +} + +impl Archive { + /// The size of .zip file that was read, in bytes. + pub fn size(&self) -> u64 { + self.size + } + + /// Iterate over all files in this zip, read from the central directory. + pub fn entries(&self) -> impl Iterator { + self.entries.iter() + } + + /// Attempts to look up an entry by name. This is usually a bad idea, + /// as names aren't necessarily normalized in zip archives. + pub fn by_name>(&self, name: N) -> Option<&StoredEntry> { + self.entries.iter().find(|&x| x.name() == name.as_ref()) + } + + /// Returns the detected character encoding for text fields + /// (names, comments) inside this zip archive. + pub fn encoding(&self) -> Encoding { + self.encoding + } + + /// Returns the comment for this archive, if any. When reading + /// a zip file with an empty comment field, this will return None. 
+ pub fn comment(&self) -> Option<&String> { + self.comment.as_ref() + } +} + +/// Describes a zip archive entry (a file, a directory, a symlink) +/// +/// `Entry` contains normalized metadata fields, that can be set when +/// writing a zip archive. Additional metadata, along with the information +/// required to extract an entry, are available in [StoredEntry][] instead. +#[derive(Clone)] +pub struct Entry { + /// Name of the file + /// Must be a relative path, not start with a drive letter (e.g. C:), + /// and must use forward slashes instead of back slashes + pub name: String, + + /// Compression method + /// + /// See [Method][] for more details. + pub method: Method, + + /// Comment is any arbitrary user-defined string shorter than 64KiB + pub comment: Option, + + /// Modified timestamp + pub modified: chrono::DateTime, + + /// Created timestamp + pub created: Option>, + + /// Accessed timestamp + pub accessed: Option>, +} + +/// An entry as stored into an Archive. Contains additional metadata and offset information. +/// +/// Whereas [Entry][] is archive-independent, [StoredEntry][] contains information that is tied to +/// a specific archive. +/// +/// When reading archives, one deals with a list of [StoredEntry][], whereas when writing one, one +/// typically only specifies an [Entry][] and provides the entry's contents: fields like the CRC32 +/// hash, uncompressed size, and compressed size are derived automatically from the input. +#[derive(Clone)] +pub struct StoredEntry { + /// Archive-independent information + /// + /// This contains the entry's name, timestamps, comment, compression method. + pub entry: Entry, + + /// Offset of the local file header in the zip file + /// + /// ```text + /// [optional non-zip data] + /// [local file header 1] <------ header_offset points here + /// [encryption header 1] + /// [file data 1] + /// [data descriptor 1] + /// ... 
+ /// [central directory] + /// [optional zip64 end of central directory info] + /// [end of central directory record] + /// ``` + pub header_offset: u64, + + /// External attributes (zip) + pub external_attrs: u32, + + /// Version of zip supported by the tool that created this archive. + pub creator_version: Version, + + /// Version of zip needed to extract this archive. + pub reader_version: Version, + + /// General purpose bit flag + /// + /// In the zip format, the most noteworthy flag (bit 11) is for UTF-8 names. + /// Other flags can indicate: encryption (unsupported), various compression + /// settings (depending on the [Method] used). + /// + /// For LZMA, general-purpose bit 1 denotes the EOS marker. + pub flags: u16, + + /// Unix user ID + /// + /// Only present if a Unix extra field or New Unix extra field was found. + pub uid: Option, + + /// Unix group ID + /// + /// Only present if a Unix extra field or New Unix extra field was found. + pub gid: Option, + + /// File mode + pub mode: Mode, + + /// Any extra fields recognized while parsing the file. + /// + /// Most of these should be normalized and accessible as other fields, + /// but they are also made available here raw. + pub extra_fields: Vec, + + /// These fields are cheap to clone and needed for entry readers, + /// hence them being in a separate struct + pub inner: StoredEntryInner, +} + +/// Fields required to read an entry properly, typically cloned into owned entry +/// readers. +#[derive(Clone, Copy, Debug)] +pub struct StoredEntryInner { + /// CRC-32 hash as found in the central directory. + /// + /// Note that this may be zero, and the actual CRC32 might be in the local header, or (more + /// commonly) in the data descriptor instead. + pub crc32: u32, + + /// Size in bytes, after compression + pub compressed_size: u64, + + /// Size in bytes, before compression + /// + /// This will be zero for directories. 
+ pub uncompressed_size: u64, + + /// True if this entry was read from a zip64 archive + pub is_zip64: bool, +} + +impl StoredEntry { + /// Returns the entry's name. See also + /// [sanitized_name()](StoredEntry::sanitized_name), which returns a + /// sanitized version of the name. + /// + /// This should be a relative path, separated by `/`. However, there are zip + /// files in the wild with all sorts of evil variants, so, be conservative + /// in what you accept. + pub fn name(&self) -> &str { + self.entry.name.as_ref() + } + + /// Returns a sanitized version of the entry's name, if it + /// seems safe. In particular, if this method feels like the + /// entry name is trying to do a zip slip (cf. + /// ), it'll return + /// None. + /// + /// Other than that, it will strip any leading slashes on non-Windows OSes. + pub fn sanitized_name(&self) -> Option<&str> { + let name = self.name(); + + // refuse entries with traversed/absolute path to mitigate zip slip + if name.contains("..") { + return None; + } + + #[cfg(windows)] + { + if name.contains(":\\") || name.starts_with("\\") { + return None; + } + Some(name) + } + + #[cfg(not(windows))] + { + // strip absolute prefix on entries pointing to root path + let mut entry_chars = name.chars(); + let mut name = name; + while name.starts_with('/') { + entry_chars.next(); + name = entry_chars.as_str() + } + Some(name) + } + } + + /// The entry's comment, if any. + /// + /// When reading a zip file, an empty comment results in None. + pub fn comment(&self) -> Option<&str> { + self.entry.comment.as_ref().map(|x| x.as_ref()) + } + + /// The compression method used for this entry + #[inline(always)] + pub fn method(&self) -> Method { + self.entry.method + } + + /// This entry's "last modified" timestamp - with caveats + /// + /// Due to the history of the ZIP file format, this may be inaccurate. It may be offset + /// by a few hours, if there is no extended timestamp information. 
It may have a resolution + /// as low as two seconds, if only MSDOS timestamps are present. It may default to the Unix + /// epoch, if something went really wrong. + /// + /// If you're reading this after the year 2038, or after the year 2108, godspeed. + #[inline(always)] + pub fn modified(&self) -> DateTime { + self.entry.modified + } + + /// This entry's "created" timestamp, if available. + /// + /// See [StoredEntry::modified()] for caveats. + #[inline(always)] + pub fn created(&self) -> Option<&DateTime> { + self.entry.created.as_ref() + } + + /// This entry's "last accessed" timestamp, if available. + /// + /// See [StoredEntry::modified()] for caveats. + #[inline(always)] + pub fn accessed(&self) -> Option<&DateTime> { + self.entry.accessed.as_ref() + } +} + +/// The contents of an entry: a directory, a file, or a symbolic link. +#[derive(Debug)] +pub enum EntryContents { + /// The entry is a directory + Directory, + + /// The entry is a file + File, + + /// The entry is a symbolic link + Symlink, +} + +impl StoredEntry { + /// Determine [EntryContents] of this entry based on its mode. + pub fn contents(&self) -> EntryContents { + if self.mode.has(Mode::SYMLINK) { + EntryContents::Symlink + } else if self.mode.has(Mode::DIR) { + EntryContents::Directory + } else { + EntryContents::File + } + } +} + +/// Compression method used for a file entry. +/// +/// In archives that follow [ISO/IEC 21320-1:2015](https://www.iso.org/standard/60101.html), only +/// [Store][Method::Store] and [Deflate][Method::Deflate] should be used. +/// +/// However, in the wild, it is not too uncommon to encounter [Bzip2][Method::Bzip2], +/// [Lzma][Method::Lzma] or others. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, IntoPrimitive, FromPrimitive)] +#[repr(u16)] +pub enum Method { + /// No compression is applied + Store = 0, + + /// [DEFLATE (RFC 1951)](https://www.ietf.org/rfc/rfc1951.txt) + Deflate = 8, + + /// [DEFLATE64](https://deflate64.com/) + Deflate64 = 9, + + /// [BZIP-2](https://github.com/dsnet/compress/blob/master/doc/bzip2-format.pdf) + Bzip2 = 12, + + /// [LZMA](https://github.com/jljusten/LZMA-SDK/blob/master/DOC/lzma-specification.txt) + Lzma = 14, + + /// [zstd](https://datatracker.ietf.org/doc/html/rfc8878) + Zstd = 93, + + /// [MP3](https://www.iso.org/obp/ui/#iso:std:iso-iec:11172:-3:ed-1:v1:en) + Mp3 = 94, + + /// [XZ](https://tukaani.org/xz/xz-file-format.txt) + Xz = 95, + + /// [JPEG](https://jpeg.org/jpeg/) + Jpeg = 96, + + /// [WavPack](https://www.wavpack.com/) + WavPack = 97, + + /// [PPMd](https://en.wikipedia.org/wiki/Prediction_by_partial_matching) + Ppmd = 98, + + /// AE-x encryption marker (see Appendix E of appnote) + Aex = 99, + + /// A compression method that isn't recognized by this crate. + #[num_enum(catch_all)] + Unrecognized(u16), +} diff --git a/rc-zip/src/parse/date_time.rs b/rc-zip/src/parse/date_time.rs new file mode 100644 index 0000000..2ebdd87 --- /dev/null +++ b/rc-zip/src/parse/date_time.rs @@ -0,0 +1,109 @@ +use chrono::{ + offset::{LocalResult, TimeZone, Utc}, + DateTime, Timelike, +}; +use std::fmt; +use winnow::{ + binary::{le_u16, le_u64}, + seq, PResult, Parser, Partial, +}; + +/// A timestamp in MS-DOS format +/// +/// Represents dates from year 1980 to 2107, with 2 second precision. +#[derive(Clone, Copy, Eq, PartialEq)] +pub struct MsdosTimestamp { + /// Time in 2-second intervals + pub time: u16, + + /// Date in MS-DOS format, cf. 
+ pub date: u16, +} + +impl fmt::Debug for MsdosTimestamp { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self.to_datetime() { + Some(dt) => write!(f, "MsdosTimestamp({})", dt), + None => write!(f, "MsdosTimestamp(?)"), + } + } +} + +impl MsdosTimestamp { + /// Parser for MS-DOS timestamps + pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { + seq! {Self { + time: le_u16, + date: le_u16, + }} + .parse_next(i) + } + + /// Attempts to convert to a chrono UTC date time + pub fn to_datetime(&self) -> Option> { + // see https://docs.microsoft.com/en-us/windows/win32/api/winbase/nf-winbase-dosdatetimetofiletime + let date = match { + // bits 0-4: day of the month (1-31) + let d = (self.date & 0b1_1111) as u32; + // bits 5-8: month (1 = january, 2 = february and so on) + let m = ((self.date >> 5) & 0b1111) as u32; + // bits 9-15: year offset from 1980 + let y = ((self.date >> 9) + 1980) as i32; + Utc.with_ymd_and_hms(y, m, d, 0, 0, 0) + } { + LocalResult::Single(date) => date, + _ => return None, + }; + + // bits 0-4: second divided by 2 + let s = (self.time & 0b1_1111) as u32 * 2; + // bits 5-10: minute (0-59) + let m = (self.time >> 5 & 0b11_1111) as u32; + // bits 11-15: hour (0-23 on a 24-hour clock) + let h = (self.time >> 11) as u32; + date.with_hour(h)?.with_minute(m)?.with_second(s) + } +} + +/// A timestamp in NTFS format. 
+#[derive(Clone, Copy, Eq, PartialEq)] +pub struct NtfsTimestamp { + /// Timestamp in 100ns intervals since 1601-01-01 00:00:00 UTC + pub timestamp: u64, +} + +impl fmt::Debug for NtfsTimestamp { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self.to_datetime() { + Some(dt) => write!(f, "NtfsTimestamp({})", dt), + None => write!(f, "NtfsTimestamp(?)"), + } + } +} + +impl NtfsTimestamp { + /// Parser for NTFS timestamps (a little-endian u64) + pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { + le_u64.map(|timestamp| Self { timestamp }).parse_next(i) + } + + /// Attempts to convert to a chrono UTC date time + pub fn to_datetime(&self) -> Option> { + // windows timestamp resolution + let ticks_per_second = 10_000_000; + let secs = (self.timestamp / ticks_per_second) as i64; + let nsecs = ((self.timestamp % ticks_per_second) * 100) as u32; + let epoch = Utc.with_ymd_and_hms(1601, 1, 1, 0, 0, 0).single()?; + match Utc.timestamp_opt(epoch.timestamp() + secs, nsecs) { + LocalResult::Single(date) => Some(date), + _ => None, + } + } +} + +pub(crate) fn zero_datetime() -> chrono::DateTime { + chrono::DateTime::from_naive_utc_and_offset( + chrono::naive::NaiveDateTime::from_timestamp_opt(0, 0).unwrap(), + chrono::offset::Utc, + ) +} diff --git a/rc-zip/src/parse/directory_header.rs b/rc-zip/src/parse/directory_header.rs new file mode 100644 index 0000000..db38717 --- /dev/null +++ b/rc-zip/src/parse/directory_header.rs @@ -0,0 +1,272 @@ +use chrono::{offset::TimeZone, DateTime, Utc}; +use tracing::trace; +use winnow::{ + binary::{le_u16, le_u32}, + prelude::PResult, + token::tag, + Parser, Partial, +}; + +use crate::{ + encoding::detect_utf8, + encoding::Encoding, + error::{Error, FormatError}, + parse::{ + zero_datetime, Entry, ExtraField, ExtraFieldSettings, HostSystem, Mode, MsdosMode, + MsdosTimestamp, NtfsAttr, StoredEntry, StoredEntryInner, UnixMode, Version, ZipBytes, + ZipString, + }, +}; + +/// 4.3.12 Central directory structure: File header +pub 
struct DirectoryHeader { + /// version made by + pub creator_version: Version, + + /// version needed to extract + pub reader_version: Version, + + /// general purpose bit flag + pub flags: u16, + + /// compression method + pub method: u16, + + /// last mod file datetime + pub modified: MsdosTimestamp, + + /// crc32 hash + pub crc32: u32, + + /// compressed size + pub compressed_size: u32, + + /// uncompressed size + pub uncompressed_size: u32, + + /// disk number start + pub disk_nbr_start: u16, + + /// internal file attributes + pub internal_attrs: u16, + + /// external file attributes + pub external_attrs: u32, + + /// relative offset of local header + pub header_offset: u32, + + /// name + pub name: ZipString, // FIXME: should this be Cow? + + /// extra + pub extra: ZipBytes, // FIXME: should this be Cow<[u8]>? + + /// comment + pub comment: ZipString, +} + +impl DirectoryHeader { + const SIGNATURE: &'static str = "PK\x01\x02"; + + /// Parser for the central directory file header + pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { + _ = tag(Self::SIGNATURE).parse_next(i)?; + let creator_version = Version::parser.parse_next(i)?; + let reader_version = Version::parser.parse_next(i)?; + let flags = le_u16.parse_next(i)?; + let method = le_u16.parse_next(i)?; + let modified = MsdosTimestamp::parser.parse_next(i)?; + let crc32 = le_u32.parse_next(i)?; + let compressed_size = le_u32.parse_next(i)?; + let uncompressed_size = le_u32.parse_next(i)?; + let name_len = le_u16.parse_next(i)?; + let extra_len = le_u16.parse_next(i)?; + let comment_len = le_u16.parse_next(i)?; + let disk_nbr_start = le_u16.parse_next(i)?; + let internal_attrs = le_u16.parse_next(i)?; + let external_attrs = le_u32.parse_next(i)?; + let header_offset = le_u32.parse_next(i)?; + + let name = ZipString::parser(name_len).parse_next(i)?; + let extra = ZipBytes::parser(extra_len).parse_next(i)?; + let comment = ZipString::parser(comment_len).parse_next(i)?; + + Ok(Self { + creator_version, + 
reader_version, + flags, + method, + modified, + crc32, + compressed_size, + uncompressed_size, + disk_nbr_start, + internal_attrs, + external_attrs, + header_offset, + name, + extra, + comment, + }) + } +} + +impl DirectoryHeader { + /// Returns true if the name or comment is not valid UTF-8 + pub fn is_non_utf8(&self) -> bool { + let (valid1, require1) = detect_utf8(&self.name.0[..]); + let (valid2, require2) = detect_utf8(&self.comment.0[..]); + if !valid1 || !valid2 { + // definitely not utf-8 + return true; + } + + if !require1 && !require2 { + // name and comment only use single-byte runes that overlap with UTF-8 + return false; + } + + // Might be UTF-8, might be some other encoding; preserve existing flag. + // Some ZIP writers use UTF-8 encoding without setting the UTF-8 flag. + // Since it is impossible to always distinguish valid UTF-8 from some + // other encoding (e.g., GBK or Shift-JIS), we trust the flag. + self.flags & 0x800 == 0 + } + + /// Converts the directory header into a stored entry: this involves + /// parsing the extra fields and converting the timestamps. 
+ pub fn as_stored_entry( + &self, + is_zip64: bool, + encoding: Encoding, + global_offset: u64, + ) -> Result { + let mut comment: Option = None; + if let Some(comment_field) = self.comment.clone().into_option() { + comment = Some(encoding.decode(&comment_field.0)?); + } + + let name = encoding.decode(&self.name.0)?; + + let mut compressed_size = self.compressed_size as u64; + let mut uncompressed_size = self.uncompressed_size as u64; + let mut header_offset = self.header_offset as u64 + global_offset; + + let mut modified: Option> = None; + let mut created: Option> = None; + let mut accessed: Option> = None; + + let mut uid: Option = None; + let mut gid: Option = None; + + let mut extra_fields: Vec = Vec::new(); + + let settings = ExtraFieldSettings { + needs_compressed_size: self.compressed_size == !0u32, + needs_uncompressed_size: self.uncompressed_size == !0u32, + needs_header_offset: self.header_offset == !0u32, + }; + + let mut slice = Partial::new(&self.extra.0[..]); + while !slice.is_empty() { + match ExtraField::mk_parser(settings).parse_next(&mut slice) { + Ok(ef) => { + match &ef { + ExtraField::Zip64(z64) => { + if let Some(n) = z64.uncompressed_size { + uncompressed_size = n; + } + if let Some(n) = z64.compressed_size { + compressed_size = n; + } + if let Some(n) = z64.header_offset { + header_offset = n; + } + } + ExtraField::Timestamp(ts) => { + modified = Utc.timestamp_opt(ts.mtime as i64, 0).single(); + } + ExtraField::Ntfs(nf) => { + for attr in &nf.attrs { + // note: other attributes are unsupported + if let NtfsAttr::Attr1(attr) = attr { + modified = attr.mtime.to_datetime(); + created = attr.ctime.to_datetime(); + accessed = attr.atime.to_datetime(); + } + } + } + ExtraField::Unix(uf) => { + modified = Utc.timestamp_opt(uf.mtime as i64, 0).single(); + if uid.is_none() { + uid = Some(uf.uid as u32); + } + if gid.is_none() { + gid = Some(uf.gid as u32); + } + } + ExtraField::NewUnix(uf) => { + uid = Some(uf.uid as u32); + gid = Some(uf.gid as 
u32); + } + _ => {} + }; + extra_fields.push(ef); + } + Err(e) => { + trace!("extra field error: {:#?}", e); + return Err(FormatError::InvalidExtraField.into()); + } + } + } + + let modified = match modified { + Some(m) => Some(m), + None => self.modified.to_datetime(), + }; + + let mut mode: Mode = match self.creator_version.host_system() { + HostSystem::Unix | HostSystem::Osx => UnixMode(self.external_attrs >> 16).into(), + HostSystem::WindowsNtfs | HostSystem::Vfat | HostSystem::MsDos => { + MsdosMode(self.external_attrs).into() + } + _ => Mode(0), + }; + if name.ends_with('/') { + // believe it or not, this is straight from the APPNOTE + mode |= Mode::DIR + }; + + Ok(StoredEntry { + entry: Entry { + name, + method: self.method.into(), + comment, + modified: modified.unwrap_or_else(zero_datetime), + created, + accessed, + }, + + creator_version: self.creator_version, + reader_version: self.reader_version, + flags: self.flags, + + inner: StoredEntryInner { + crc32: self.crc32, + compressed_size, + uncompressed_size, + is_zip64, + }, + header_offset, + + uid, + gid, + mode, + + extra_fields, + + external_attrs: self.external_attrs, + }) + } +} diff --git a/rc-zip/src/parse/eocd.rs b/rc-zip/src/parse/eocd.rs new file mode 100644 index 0000000..386b091 --- /dev/null +++ b/rc-zip/src/parse/eocd.rs @@ -0,0 +1,302 @@ +use tracing::trace; +use winnow::{ + binary::{le_u16, le_u32, le_u64, length_take}, + seq, + token::tag, + PResult, Parser, Partial, +}; + +use crate::{ + error::{Error, FormatError}, + parse::ZipString, +}; + +/// 4.3.16 End of central directory record: +#[derive(Debug)] +pub struct EndOfCentralDirectoryRecord { + /// number of this disk + pub disk_nbr: u16, + + /// number of the disk with the start of the central directory + pub dir_disk_nbr: u16, + + /// total number of entries in the central directory on this disk + pub dir_records_this_disk: u16, + + /// total number of entries in the central directory + pub directory_records: u16, + + /// size of 
the central directory + pub directory_size: u32, + + /// offset of start of central directory with respect to the starting disk number + pub directory_offset: u32, + + /// .ZIP file comment + pub comment: ZipString, +} + +impl EndOfCentralDirectoryRecord { + /// Does not include comment size & comment data + const MIN_LENGTH: usize = 20; + const SIGNATURE: &'static str = "PK\x05\x06"; + + /// Find the last end of central directory record in a block of data (`None` if there is none, e.g. when the block is shorter than `MIN_LENGTH`) + pub fn find_in_block(b: &[u8]) -> Option> { + for i in (0..(b.len().saturating_sub(Self::MIN_LENGTH) + 1)).rev() { + let mut input = Partial::new(&b[i..]); + if let Ok(directory) = Self::parser.parse_next(&mut input) { + return Some(Located { + offset: i as u64, + inner: directory, + }); + } + } + None + } + + /// Parser for the end of central directory record + pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { + let _ = tag(Self::SIGNATURE).parse_next(i)?; + seq! {Self { + disk_nbr: le_u16, + dir_disk_nbr: le_u16, + dir_records_this_disk: le_u16, + directory_records: le_u16, + directory_size: le_u32, + directory_offset: le_u32, + comment: length_take(le_u16).map(ZipString::from), + }} + .parse_next(i) + } +} + +/// 4.3.15 Zip64 end of central directory locator +#[derive(Debug)] +pub struct EndOfCentralDirectory64Locator { + /// number of the disk with the start of the zip64 end of central directory + pub dir_disk_number: u32, + /// relative offset of the zip64 end of central directory record + pub directory_offset: u64, + /// total number of disks + pub total_disks: u32, +} + +impl EndOfCentralDirectory64Locator { + /// Length of the locator + pub const LENGTH: usize = 20; + const SIGNATURE: &'static str = "PK\x06\x07"; + + /// Parser for the zip64 end of central directory locator + pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { + _ = tag(Self::SIGNATURE).parse_next(i)?; + seq! 
{Self { + dir_disk_number: le_u32, + directory_offset: le_u64, + total_disks: le_u32, + }} + .parse_next(i) + } +} + +/// 4.3.14 Zip64 end of central directory record +#[derive(Debug)] +pub struct EndOfCentralDirectory64Record { + /// size of zip64 end of central directory record + pub record_size: u64, + + /// version made by + pub creator_version: u16, + + /// version needed to extract + pub reader_version: u16, + + /// number of this disk + pub disk_nbr: u32, + + /// number of the disk with the start of the central directory + pub dir_disk_nbr: u32, + + /// total number of entries in the central directory on this disk + pub dir_records_this_disk: u64, + + /// total number of entries in the central directory + pub directory_records: u64, + + /// size of the central directory + pub directory_size: u64, + + /// offset of the start of central directory with respect to the + /// starting disk number + pub directory_offset: u64, +} + +impl EndOfCentralDirectory64Record { + const SIGNATURE: &'static str = "PK\x06\x06"; + + /// Parser for the zip64 end of central directory record + pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { + _ = tag(Self::SIGNATURE).parse_next(i)?; + seq! 
{Self { + record_size: le_u64, + creator_version: le_u16, + reader_version: le_u16, + disk_nbr: le_u32, + dir_disk_nbr: le_u32, + dir_records_this_disk: le_u64, + directory_records: le_u64, + directory_size: le_u64, + directory_offset: le_u64, + }} + .parse_next(i) + } +} + +/// A zip structure and its location in the input file +#[derive(Debug)] +pub struct Located { + /// Absolute byte offset from the start of the file + pub offset: u64, + + /// The structure itself + pub inner: T, +} + +impl std::ops::Deref for Located { + type Target = T; + fn deref(&self) -> &Self::Target { + &self.inner + } +} + +impl std::ops::DerefMut for Located { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.inner + } +} + +/// Coalesces zip and zip64 "end of central directory" record info +pub struct EndOfCentralDirectory { + /// The end of central directory record + pub dir: Located, + + /// The zip64 end of central directory record + pub dir64: Option>, + + /// Zip files may be prepended by arbitrary data, this is how much + /// data is at the beginning of the file that isn't part of the zip + pub global_offset: i64, +} + +impl EndOfCentralDirectory { + pub(crate) fn new( + size: u64, + dir: Located, + dir64: Option>, + ) -> Result { + let mut res = Self { + dir, + dir64, + global_offset: 0, + }; + + // + // Pure .zip files look like this: + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // <------directory_size-----> + // [ Data 1 ][ Data 2 ][ Central directory ][ ??? ] + // ^ ^ ^ + // 0 directory_offset directory_end_offset + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // + // But there exist some valid zip archives with padding at the beginning, like so: + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // <--global_offset-> <------directory_size-----> + // [ Padding ][ Data 1 ][ Data 2 ][ Central directory ][ ??? 
] + // ^ ^ ^ ^ + // 0 global_offset computed_directory_offset directory_end_offset + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // + // (e.g. https://www.icculus.org/mojosetup/ installers are ELF binaries with a .zip file appended) + // + // `directory_end_offset` is found by scanning the file (so it accounts for padding), but + // `directory_offset` is found by reading a data structure (so it does not account for padding). + // If we just trusted `directory_offset`, we'd be reading the central directory at the wrong place: + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // <------directory_size-----> + // [ Padding ][ Data 1 ][ Data 2 ][ Central directory ][ ??? ] + // ^ ^ ^ + // 0 directory_offset - woops! directory_end_offset + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + let computed_directory_offset = res.located_directory_offset().checked_sub(res.directory_size()).ok_or(FormatError::DirectoryOffsetPointsOutsideFile)?; + + // did we find a valid offset? + if (0..size).contains(&computed_directory_offset) { + // that's different from the recorded one? 
+ if computed_directory_offset != res.directory_offset() { + // then assume the whole file is offset + res.global_offset = + computed_directory_offset as i64 - res.directory_offset() as i64; + res.set_directory_offset(computed_directory_offset); + } + } + + // make sure directory_offset points to somewhere in our file + trace!( + "directory offset = {}, valid range = 0..{}", + res.directory_offset(), + size + ); + if !(0..size).contains(&res.directory_offset()) { + return Err(FormatError::DirectoryOffsetPointsOutsideFile.into()); + } + + Ok(res) + } + + #[inline] + pub(crate) fn located_directory_offset(&self) -> u64 { + match self.dir64.as_ref() { + Some(d64) => d64.offset, + None => self.dir.offset, + } + } + + #[inline] + pub(crate) fn directory_offset(&self) -> u64 { + match self.dir64.as_ref() { + Some(d64) => d64.directory_offset, + None => self.dir.directory_offset as u64, + } + } + + #[inline] + pub(crate) fn directory_size(&self) -> u64 { + match self.dir64.as_ref() { + Some(d64) => d64.directory_size, + None => self.dir.directory_size as u64, + } + } + + #[inline] + pub(crate) fn set_directory_offset(&mut self, offset: u64) { + match self.dir64.as_mut() { + Some(d64) => d64.directory_offset = offset, + None => self.dir.directory_offset = offset as u32, + }; + } + + #[inline] + pub(crate) fn directory_records(&self) -> u64 { + match self.dir64.as_ref() { + Some(d64) => d64.directory_records, + None => self.dir.directory_records as u64, + } + } + + #[inline] + pub(crate) fn comment(&self) -> &ZipString { + &self.dir.comment + } +} diff --git a/rc-zip/src/parse/extra_field.rs b/rc-zip/src/parse/extra_field.rs new file mode 100644 index 0000000..9b3693b --- /dev/null +++ b/rc-zip/src/parse/extra_field.rs @@ -0,0 +1,315 @@ +use tracing::trace; +use winnow::{ + binary::{le_u16, le_u32, le_u64, le_u8, length_take}, + combinator::{cond, opt, preceded, repeat_till}, + error::{ErrMode, ErrorKind, ParserError, StrContext}, + seq, + token::{tag, take}, + PResult, 
Parser, Partial, +}; + +use crate::parse::{NtfsTimestamp, ZipBytes}; + +/// 4.4.28 extra field: (Variable) +pub(crate) struct ExtraFieldRecord<'a> { + pub(crate) tag: u16, + pub(crate) payload: &'a [u8], +} + +impl<'a> ExtraFieldRecord<'a> { + pub(crate) fn parser(i: &mut Partial<&'a [u8]>) -> PResult { + seq! {Self { + tag: le_u16, + payload: length_take(le_u16), + }} + .parse_next(i) + } +} + +// Useful because zip64 extended information extra field has fixed order *but* +// optional fields. From the appnote: +// +// If one of the size or offset fields in the Local or Central directory record +// is too small to hold the required data, a Zip64 extended information record +// is created. The order of the fields in the zip64 extended information record +// is fixed, but the fields MUST only appear if the corresponding Local or +// Central directory record field is set to 0xFFFF or 0xFFFFFFFF. +#[derive(Debug, Clone, Copy)] +pub(crate) struct ExtraFieldSettings { + pub(crate) needs_uncompressed_size: bool, + pub(crate) needs_compressed_size: bool, + pub(crate) needs_header_offset: bool, +} + +/// Information stored in the central directory header `extra` field +/// +/// This typically contains timestamps, file sizes and offsets, file mode, uid/gid, etc. +/// +/// See `extrafld.txt` in this crate's source distribution. 
+#[derive(Clone)] +pub enum ExtraField { + /// Zip64 extended information extra field + Zip64(ExtraZip64Field), + /// Extended timestamp + Timestamp(ExtraTimestampField), + /// UNIX & Info-Zip UNIX + Unix(ExtraUnixField), + /// New UNIX extra field + NewUnix(ExtraNewUnixField), + /// NTFS (Win9x/WinNT FileTimes) + Ntfs(ExtraNtfsField), + /// Unknown extra field, with tag + Unknown { + /// tag of the extra field + tag: u16, + }, +} + +impl ExtraField { + pub(crate) fn mk_parser( + settings: ExtraFieldSettings, + ) -> impl FnMut(&mut Partial<&'_ [u8]>) -> PResult { + move |i| { + use ExtraField as EF; + let rec = ExtraFieldRecord::parser.parse_next(i)?; + trace!("parsing extra field record, tag {:04x}", rec.tag); + let payload = &mut Partial::new(rec.payload); + + let variant = match rec.tag { + ExtraZip64Field::TAG => opt(ExtraZip64Field::mk_parser(settings).map(EF::Zip64)) + .context(StrContext::Label("zip64")) + .parse_next(payload)?, + ExtraTimestampField::TAG => opt(ExtraTimestampField::parser.map(EF::Timestamp)) + .context(StrContext::Label("timestamp")) + .parse_next(payload)?, + ExtraNtfsField::TAG => { + opt(ExtraNtfsField::parse.map(EF::Ntfs)).parse_next(payload)? + } + ExtraUnixField::TAG | ExtraUnixField::TAG_INFOZIP => { + opt(ExtraUnixField::parser.map(EF::Unix)).parse_next(payload)? + } + ExtraNewUnixField::TAG => { + opt(ExtraNewUnixField::parser.map(EF::NewUnix)).parse_next(payload)? 
+ } + _ => None, + } + .unwrap_or(EF::Unknown { tag: rec.tag }); + + Ok(variant) + } + } +} + +/// 4.5.3 -Zip64 Extended Information Extra Field (0x0001) +#[derive(Clone, Default)] +pub struct ExtraZip64Field { + /// 64-bit uncompressed size + pub uncompressed_size: Option<u64>, + + /// 64-bit compressed size + pub compressed_size: Option<u64>, + + /// 64-bit header offset + pub header_offset: Option<u64>, +} + +impl ExtraZip64Field { + const TAG: u16 = 0x0001; + + pub(crate) fn mk_parser( + settings: ExtraFieldSettings, + ) -> impl FnMut(&mut Partial<&'_ [u8]>) -> PResult<Self> { + move |i| { + // N.B: we ignore "disk start number" + seq! {Self { + uncompressed_size: cond(settings.needs_uncompressed_size, le_u64), + compressed_size: cond(settings.needs_compressed_size, le_u64), + header_offset: cond(settings.needs_header_offset, le_u64), + }} + .parse_next(i) + } + } +} + +/// Extended timestamp extra field +#[derive(Clone)] +pub struct ExtraTimestampField { + /// number of seconds since epoch + pub mtime: u32, +} + +impl ExtraTimestampField { + const TAG: u16 = 0x5455; + + fn parser(i: &mut Partial<&'_ [u8]>) -> PResult<Self> { + preceded( + // 1 byte of flags, if bit 0 is set, modification time is present + le_u8.verify(|x| x & 0b1 != 0), + seq! {Self { mtime: le_u32 }}, + ) + .parse_next(i) + } +} + +/// 4.5.7 -UNIX Extra Field (0x000d): +#[derive(Clone)] +pub struct ExtraUnixField { + /// file last access time + pub atime: u32, + /// file last modification time + pub mtime: u32, + /// file user id + pub uid: u16, + /// file group id + pub gid: u16, + /// variable length data field + pub data: ZipBytes, +} + +impl ExtraUnixField { + const TAG: u16 = 0x000d; + const TAG_INFOZIP: u16 = 0x5855; + + fn parser(i: &mut Partial<&'_ [u8]>) -> PResult<Self> { + let t_size = le_u16.verify(|size: &u16| *size >= 12).parse_next(i)? - 12; // below the 12 fixed bytes is malformed: fail, don't underflow + seq! 
{Self { + atime: le_u32, + mtime: le_u32, + uid: le_u16, + gid: le_u16, + data: ZipBytes::parser(t_size), + }} + .parse_next(i) + } +} + +/// Info-ZIP New Unix Extra Field: +/// ==================================== +/// +/// Currently stores Unix UIDs/GIDs up to 32 bits. +/// (Last Revision 20080509) +/// +/// ```text +/// Value Size Description +/// ----- ---- ----------- +/// 0x7875 Short tag for this extra block type ("ux") +/// TSize Short total data size for this block +/// Version 1 byte version of this extra field, currently 1 +/// UIDSize 1 byte Size of UID field +/// UID Variable UID for this entry +/// GIDSize 1 byte Size of GID field +/// GID Variable GID for this entry +/// ``` +#[derive(Clone)] +pub struct ExtraNewUnixField { + /// file user id + pub uid: u64, + + /// file group id + pub gid: u64, +} + +impl ExtraNewUnixField { + const TAG: u16 = 0x7875; + + fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { + let _ = tag("\x01").parse_next(i)?; + seq! {Self { + uid: Self::parse_variable_length_integer, + gid: Self::parse_variable_length_integer, + }} + .parse_next(i) + } + + fn parse_variable_length_integer(i: &mut Partial<&'_ [u8]>) -> PResult { + let slice = length_take(le_u8).parse_next(i)?; + if let Some(u) = match slice.len() { + 1 => Some(le_u8.parse_peek(slice)?.1 as u64), + 2 => Some(le_u16.parse_peek(slice)?.1 as u64), + 4 => Some(le_u32.parse_peek(slice)?.1 as u64), + 8 => Some(le_u64.parse_peek(slice)?.1), + _ => None, + } { + Ok(u) + } else { + Err(ErrMode::from_error_kind(i, ErrorKind::Alt)) + } + } +} + +/// 4.5.5 -NTFS Extra Field (0x000a): +#[derive(Clone)] +pub struct ExtraNtfsField { + /// NTFS attributes + pub attrs: Vec, +} + +impl ExtraNtfsField { + const TAG: u16 = 0x000a; + + fn parse(i: &mut Partial<&'_ [u8]>) -> PResult { + let _ = take(4_usize).parse_next(i)?; // reserved (unused) + seq! 
{Self { + // from the winnow docs: + // Parsers like repeat do not know when an eof is from insufficient + // data or the end of the stream, causing them to always report + // Incomplete. + // using repeat_till with eof combinator to work around this: + attrs: repeat_till(0.., NtfsAttr::parse, winnow::combinator::eof).map(|x| x.0), + }} + .parse_next(i) + } +} + +/// NTFS attribute for zip entries (mostly timestamps) +#[derive(Clone)] +pub enum NtfsAttr { + /// NTFS attribute 1, which contains modified/accessed/created timestamps + Attr1(NtfsAttr1), + + /// Unknown NTFS attribute + Unknown { + /// tag of the attribute + tag: u16, + }, +} + +impl NtfsAttr { + fn parse(i: &mut Partial<&'_ [u8]>) -> PResult { + let tag = le_u16.parse_next(i)?; + trace!("parsing NTFS attribute, tag {:04x}", tag); + let payload = length_take(le_u16).parse_next(i)?; + + match tag { + 0x0001 => NtfsAttr1::parser + .parse_peek(Partial::new(payload)) + .map(|(_, attr)| NtfsAttr::Attr1(attr)), + _ => Ok(NtfsAttr::Unknown { tag }), + } + } +} + +/// NTFS attribute 1, which contains modified/accessed/created timestamps +#[derive(Clone)] +pub struct NtfsAttr1 { + /// modified time + pub mtime: NtfsTimestamp, + + /// accessed time + pub atime: NtfsTimestamp, + + /// created time + pub ctime: NtfsTimestamp, +} + +impl NtfsAttr1 { + fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { + trace!("parsing NTFS attr 1, input len is {}", i.len()); + seq! 
{Self { + mtime: NtfsTimestamp::parser, + atime: NtfsTimestamp::parser, + ctime: NtfsTimestamp::parser, + }} + .parse_next(i) + } +} diff --git a/rc-zip/src/parse/local.rs b/rc-zip/src/parse/local.rs new file mode 100644 index 0000000..fc73ef6 --- /dev/null +++ b/rc-zip/src/parse/local.rs @@ -0,0 +1,214 @@ +use crate::{ + error::{Error, UnsupportedError}, + parse::{Method, MsdosTimestamp, Version, ZipBytes, ZipString}, +}; + +use winnow::{ + binary::{le_u16, le_u32, le_u64, le_u8}, + combinator::opt, + error::{ContextError, ErrMode, ErrorKind, FromExternalError}, + seq, + token::tag, + PResult, Parser, Partial, +}; + +#[derive(Debug)] +/// 4.3.7 Local file header +pub struct LocalFileHeaderRecord { + /// version needed to extract + pub reader_version: Version, + + /// general purpose bit flag + pub flags: u16, + + /// compression method + pub method: Method, + + /// last mod file datetime + pub modified: MsdosTimestamp, + + /// crc-32 + pub crc32: u32, + + /// compressed size + pub compressed_size: u32, + + /// uncompressed size + pub uncompressed_size: u32, + + /// file name + pub name: ZipString, + + /// extra field + pub extra: ZipBytes, + + /// method-specific fields + pub method_specific: MethodSpecific, +} + +#[derive(Debug)] +/// Method-specific properties following the local file header +pub enum MethodSpecific { + /// No method-specific properties + None, + + /// LZMA properties + Lzma(LzmaProperties), +} + +impl LocalFileHeaderRecord { + /// The signature for a local file header + pub const SIGNATURE: &'static str = "PK\x03\x04"; + + /// Parser for the local file header + pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { + let _ = tag(Self::SIGNATURE).parse_next(i)?; + + let reader_version = Version::parser.parse_next(i)?; + let flags = le_u16.parse_next(i)?; + let method = le_u16.parse_next(i).map(Method::from)?; + let modified = MsdosTimestamp::parser.parse_next(i)?; + let crc32 = le_u32.parse_next(i)?; + let compressed_size = le_u32.parse_next(i)?; 
+ let uncompressed_size = le_u32.parse_next(i)?; + + let name_len = le_u16.parse_next(i)?; + let extra_len = le_u16.parse_next(i)?; + + let name = ZipString::parser(name_len).parse_next(i)?; + let extra = ZipBytes::parser(extra_len).parse_next(i)?; + + let method_specific = match method { + Method::Lzma => { + let lzma_properties = LzmaProperties::parser.parse_next(i)?; + if let Err(e) = lzma_properties.error_if_unsupported() { + return Err(ErrMode::Cut(ContextError::from_external_error( + i, + ErrorKind::Verify, + e, + ))); + } + MethodSpecific::Lzma(lzma_properties) + } + _ => MethodSpecific::None, + }; + + Ok(Self { + reader_version, + flags, + method, + modified, + crc32, + compressed_size, + uncompressed_size, + name, + extra, + method_specific, + }) + } + + /// Check for the presence of the bit flag that indicates a data descriptor + /// is present after the file data. + pub fn has_data_descriptor(&self) -> bool { + // 4.3.9.1 This descriptor MUST exist if bit 3 of the general + // purpose bit flag is set (see below). + self.flags & 0b1000 != 0 + } +} + +/// 4.3.9 Data descriptor: +#[derive(Debug)] +pub struct DataDescriptorRecord { + /// CRC32 checksum + pub crc32: u32, + /// Compressed size + pub compressed_size: u64, + /// Uncompressed size + pub uncompressed_size: u64, +} + +impl DataDescriptorRecord { + const SIGNATURE: &'static str = "PK\x07\x08"; + + /// Create a parser for the data descriptor record. + pub fn mk_parser(is_zip64: bool) -> impl FnMut(&mut Partial<&'_ [u8]>) -> PResult { + move |i| { + // From appnote.txt: + // + // 4.3.9.3 Although not originally assigned a signature, the value + // 0x08074b50 has commonly been adopted as a signature value for the + // data descriptor record. Implementers SHOULD be aware that ZIP files + // MAY be encountered with or without this signature marking data + // descriptors and SHOULD account for either case when reading ZIP files + // to ensure compatibility. 
+ let _ = opt(tag(Self::SIGNATURE)).parse_next(i)?; + + if is_zip64 { + seq! {Self { + crc32: le_u32, + compressed_size: le_u64, + uncompressed_size: le_u64, + }} + .parse_next(i) + } else { + seq! {Self { + crc32: le_u32, + compressed_size: le_u32.map(|x| x as u64), + uncompressed_size: le_u32.map(|x| x as u64), + }} + .parse_next(i) + } + } + } +} + +/// 5.8.5 LZMA Properties header +#[derive(Debug)] +pub struct LzmaProperties { + /// major version + pub major: u8, + /// minor version + pub minor: u8, + /// properties size + pub properties_size: u16, +} + +impl LzmaProperties { + /// Parser for the LZMA properties header. + pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { + // Note: the actual properties (5 bytes, contains dictionary size, + // and various other settings) is not actually read, because lzma-rs + // reads those properties itself. + + seq! {Self { + major: le_u8, + minor: le_u8, + properties_size: le_u16, + }} + .parse_next(i) + } + + /// Check if the LZMA version is supported. + pub fn error_if_unsupported(&self) -> Result<(), Error> { + if (self.major, self.minor) != (2, 0) { + return Err(Error::Unsupported( + UnsupportedError::LzmaVersionUnsupported { + minor: self.minor, + major: self.major, + }, + )); + } + + const LZMA_PROPERTIES_SIZE: u16 = 5; + if self.properties_size != LZMA_PROPERTIES_SIZE { + return Err(Error::Unsupported( + UnsupportedError::LzmaPropertiesHeaderWrongSize { + expected: 5, + actual: self.properties_size, + }, + )); + } + + Ok(()) + } +} diff --git a/rc-zip/src/parse/mod.rs b/rc-zip/src/parse/mod.rs new file mode 100644 index 0000000..962c24e --- /dev/null +++ b/rc-zip/src/parse/mod.rs @@ -0,0 +1,35 @@ +//! Parsers and types for the various elements that make up a ZIP file. +//! +//! Contain winnow parsers for most elements that make up a ZIP file, like the +//! end-of-central-directory record, local file headers, and central directory +//! headers. +//! +//! 
All parsers here are based off of the PKWARE appnote.txt, which you can find +//! in the source repository. + +mod archive; +pub use archive::*; + +mod extra_field; +pub use extra_field::*; + +mod mode; +pub use mode::*; + +mod version; +pub use version::*; + +mod date_time; +pub use date_time::*; + +mod directory_header; +pub use directory_header::*; + +mod eocd; +pub use eocd::*; + +mod local; +pub use local::*; + +mod raw; +pub use raw::*; diff --git a/rc-zip/src/parse/mode.rs b/rc-zip/src/parse/mode.rs new file mode 100644 index 0000000..9185eec --- /dev/null +++ b/rc-zip/src/parse/mode.rs @@ -0,0 +1,266 @@ +use std::fmt; + +/// Mode represents a file's mode and permission bits. +/// The bits have the same definition on all systems, +/// but not all bits apply to all systems. +/// +/// It is modelled after Go's `os.FileMode`. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct Mode(pub u32); + +impl Mode { + /// d: is a directory + pub const DIR: Self = Self(1 << 31); + /// a: append-only + pub const APPEND: Self = Self(1 << 30); + /// l: exclusive use + pub const EXCLUSIVE: Self = Self(1 << 29); + /// T: temporary file; Plan 9 only + pub const TEMPORARY: Self = Self(1 << 28); + /// L: symbolic link + pub const SYMLINK: Self = Self(1 << 27); + /// D: device file + pub const DEVICE: Self = Self(1 << 26); + /// p: named pipe (FIFO) + pub const NAMED_PIPE: Self = Self(1 << 25); + /// S: Unix domain socket + pub const SOCKET: Self = Self(1 << 24); + /// u: setuid + pub const SETUID: Self = Self(1 << 23); + /// g: setgid + pub const SETGID: Self = Self(1 << 22); + /// c: Unix character device, when DEVICE is set + pub const CHAR_DEVICE: Self = Self(1 << 21); + /// t: sticky + pub const STICKY: Self = Self(1 << 20); + /// ?: non-regular file; nothing else is known + pub const IRREGULAR: Self = Self(1 << 19); +} + +impl fmt::Display for Mode { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let mut w = 0; + if self.has(Self::DIR) { + write!(f, 
"d")?; + w += 1; + } + if self.has(Self::APPEND) { + write!(f, "a")?; + w += 1; + } + if self.has(Self::EXCLUSIVE) { + write!(f, "l")?; + w += 1; + } + if self.has(Self::TEMPORARY) { + write!(f, "T")?; + w += 1; + } + if self.has(Self::SYMLINK) { + write!(f, "L")?; + w += 1; + } + if self.has(Self::DEVICE) { + write!(f, "D")?; + w += 1; + } + if self.has(Self::NAMED_PIPE) { + write!(f, "p")?; + w += 1; + } + if self.has(Self::SOCKET) { + write!(f, "S")?; + w += 1; + } + if self.has(Self::SETUID) { + write!(f, "u")?; + w += 1; + } + if self.has(Self::SETGID) { + write!(f, "g")?; + w += 1; + } + if self.has(Self::CHAR_DEVICE) { + write!(f, "c")?; + w += 1; + } + if self.has(Self::STICKY) { + write!(f, "t")?; + w += 1; + } + if self.has(Self::IRREGULAR) { + write!(f, "?")?; + w += 1; + } + if w == 0 { + write!(f, "-")?; + } + + let rwx = "rwxrwxrwx"; + for (i, c) in rwx.char_indices() { + if self.has(Mode(1 << (9 - 1 - i))) { + write!(f, "{}", c)?; + } else { + write!(f, "-")?; + } + } + + Ok(()) + } +} + +impl From<UnixMode> for Mode { + fn from(m: UnixMode) -> Self { + let mut mode = Mode(m.0 & 0o777); + + match m & UnixMode::IFMT { + UnixMode::IFBLK => mode |= Mode::DEVICE, + UnixMode::IFCHR => mode |= Mode::DEVICE | Mode::CHAR_DEVICE, // both bits: CHAR_DEVICE only qualifies DEVICE + UnixMode::IFDIR => mode |= Mode::DIR, + UnixMode::IFIFO => mode |= Mode::NAMED_PIPE, + UnixMode::IFLNK => mode |= Mode::SYMLINK, + UnixMode::IFREG => { /* nothing to do */ } + UnixMode::IFSOCK => mode |= Mode::SOCKET, + _ => {} + } + + if m.has(UnixMode::ISGID) { + mode |= Mode::SETGID + } + if m.has(UnixMode::ISUID) { + mode |= Mode::SETUID + } + if m.has(UnixMode::ISVTX) { + mode |= Mode::STICKY + } + + mode + } +} + +impl From<MsdosMode> for Mode { + fn from(m: MsdosMode) -> Self { + let mut mode = if m.has(MsdosMode::DIR) { + Mode::DIR | Mode(0o777) + } else { + Mode(0o666) + }; + if m.has(MsdosMode::READ_ONLY) { + mode &= Mode(!0o222); // clear the write bits, keep DIR and read/exec + } + + 
mode + } +} + +impl From<u32> for Mode { + fn from(u: u32) -> Self { + Mode(u) + } +} + +/// UnixMode 
represents the file mode and permission bits for Unix systems. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct UnixMode(pub u32); + +impl UnixMode { + /// bit mask for the file type bit fields + pub const IFMT: Self = Self(0xf000); + + /// the file is a socket + pub const IFSOCK: Self = Self(0xc000); + + /// the file is a symbolic link + pub const IFLNK: Self = Self(0xa000); + + /// the file is a regular file + pub const IFREG: Self = Self(0x8000); + + /// the file is a block device + pub const IFBLK: Self = Self(0x6000); + + /// the file is a directory + pub const IFDIR: Self = Self(0x4000); + + /// the file is a character device + pub const IFCHR: Self = Self(0x2000); + + /// the file is a FIFO + pub const IFIFO: Self = Self(0x1000); + + /// the file is set-user-ID + pub const ISUID: Self = Self(0x800); + + /// the file is set-group-ID + pub const ISGID: Self = Self(0x400); + + /// the file is sticky + pub const ISVTX: Self = Self(0x200); +} + +impl From for UnixMode { + fn from(u: u32) -> Self { + UnixMode(u) + } +} + +/// MsdosMode represents the file mode and permission bits for MS-DOS +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct MsdosMode(pub u32); + +impl MsdosMode { + /// the file is a directory + pub const DIR: Self = Self(0x10); + + /// the file is read-only + pub const READ_ONLY: Self = Self(0x01); +} + +impl From for MsdosMode { + fn from(u: u32) -> Self { + MsdosMode(u) + } +} + +macro_rules! 
derive_bitops { + ($T: ty) => { + impl std::ops::BitOr for $T { + type Output = Self; + + fn bitor(self, rhs: Self) -> Self { + Self(self.0 | rhs.0) + } + } + + impl std::ops::BitOrAssign for $T { + fn bitor_assign(&mut self, rhs: Self) { + self.0 |= rhs.0; + } + } + + impl std::ops::BitAnd for $T { + type Output = Self; + + fn bitand(self, rhs: Self) -> Self { + Self(self.0 & rhs.0) + } + } + + impl std::ops::BitAndAssign for $T { + fn bitand_assign(&mut self, rhs: Self) { + self.0 &= rhs.0; + } + } + + impl $T { + /// Check if the mode has the given bits set. + pub fn has(&self, rhs: Self) -> bool { + self.0 & rhs.0 != 0 + } + } + }; +} + +derive_bitops!(Mode); +derive_bitops!(UnixMode); +derive_bitops!(MsdosMode); diff --git a/rc-zip/src/parse/raw.rs b/rc-zip/src/parse/raw.rs new file mode 100644 index 0000000..fb978ab --- /dev/null +++ b/rc-zip/src/parse/raw.rs @@ -0,0 +1,77 @@ +use pretty_hex::PrettyHex; +use std::fmt; +use winnow::{stream::ToUsize, token::take, PResult, Parser, Partial}; + +/// A raw zip string, with no specific encoding. +/// +/// This is used while parsing a zip archive's central directory, +/// before we know what encoding is used. +#[derive(Clone)] +pub struct ZipString(pub Vec); + +impl<'a> From<&'a [u8]> for ZipString { + fn from(slice: &'a [u8]) -> Self { + Self(slice.into()) + } +} + +impl fmt::Debug for ZipString { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match std::str::from_utf8(&self.0) { + Ok(s) => write!(f, "{:?}", s), + Err(_) => write!(f, "[non-utf8 string: {}]", self.0.hex_dump()), + } + } +} + +impl ZipString { + pub(crate) fn parser(count: C) -> impl FnMut(&mut Partial<&'_ [u8]>) -> PResult + where + C: ToUsize, + { + let count = count.to_usize(); + move |i| (take(count).map(|slice: &[u8]| Self(slice.into()))).parse_next(i) + } + + pub(crate) fn into_option(self) -> Option { + if !self.0.is_empty() { + Some(self) + } else { + None + } + } +} + +/// A raw u8 slice, with no specific structure. 
+/// +/// This is used while parsing a zip archive, when we want +/// to retain an owned slice to be parsed later. +#[derive(Clone)] +pub struct ZipBytes(pub Vec); + +impl fmt::Debug for ZipBytes { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + const MAX_SHOWN_SIZE: usize = 10; + let data = &self.0[..]; + let (slice, extra) = if data.len() > MAX_SHOWN_SIZE { + (&self.0[..MAX_SHOWN_SIZE], Some(data.len() - MAX_SHOWN_SIZE)) + } else { + (&self.0[..], None) + }; + write!(f, "{}", slice.hex_dump())?; + if let Some(extra) = extra { + write!(f, " (+ {} bytes)", extra)?; + } + Ok(()) + } +} + +impl ZipBytes { + pub(crate) fn parser(count: C) -> impl FnMut(&mut Partial<&'_ [u8]>) -> PResult + where + C: ToUsize, + { + let count = count.to_usize(); + move |i| (take(count).map(|slice: &[u8]| Self(slice.into()))).parse_next(i) + } +} diff --git a/rc-zip/src/parse/version.rs b/rc-zip/src/parse/version.rs new file mode 100644 index 0000000..1b9ac8f --- /dev/null +++ b/rc-zip/src/parse/version.rs @@ -0,0 +1,133 @@ +use std::fmt; +use winnow::{binary::le_u16, PResult, Parser, Partial}; + +/// A zip version (either created by, or required when reading an archive). +/// +/// Versions determine which features are supported by a tool, and +/// which features are required when reading a file. +/// +/// For more information, see the [.ZIP Application Note](https://support.pkware.com/display/PKZIP/APPNOTE), section 4.4.2. +#[derive(Clone, Copy, PartialEq, Eq, Hash)] +pub struct Version(pub u16); + +impl fmt::Debug for Version { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "{:?} v{}.{}", + self.host_system(), + self.major(), + self.minor() + ) + } +} + +impl Version { + /// Parse a version from a byte slice + pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { + le_u16.map(Self).parse_next(i) + } + + /// Identifies the host system on which the zip attributes are compatible. 
+ pub fn host_system(&self) -> HostSystem { + match self.host() { + 0 => HostSystem::MsDos, + 1 => HostSystem::Amiga, + 2 => HostSystem::OpenVms, + 3 => HostSystem::Unix, + 4 => HostSystem::VmCms, + 5 => HostSystem::AtariSt, + 6 => HostSystem::Os2Hpfs, + 7 => HostSystem::Macintosh, + 8 => HostSystem::ZSystem, + 9 => HostSystem::CpM, + 10 => HostSystem::WindowsNtfs, + 11 => HostSystem::Mvs, + 12 => HostSystem::Vse, + 13 => HostSystem::AcornRisc, + 14 => HostSystem::Vfat, + 15 => HostSystem::AlternateMvs, + 16 => HostSystem::BeOs, + 17 => HostSystem::Tandem, + 18 => HostSystem::Os400, + 19 => HostSystem::Osx, + n => HostSystem::Unknown(n), + } + } + + /// Integer host system + pub fn host(&self) -> u8 { + (self.0 >> 8) as u8 + } + + /// Integer version, e.g. 45 for Zip version 4.5 + pub fn version(&self) -> u8 { + (self.0 & 0xff) as u8 + } + + /// ZIP specification major version + /// + /// See APPNOTE, section 4.4.2.1 + pub fn major(&self) -> u32 { + self.version() as u32 / 10 + } + + /// ZIP specification minor version + /// + /// See APPNOTE, section 4.4.2.1 + pub fn minor(&self) -> u32 { + self.version() as u32 % 10 + } +} + +/// System on which an archive was created, as encoded into a version u16. 
+/// +/// See APPNOTE, section 4.4.2.2 +#[derive(Debug)] +pub enum HostSystem { + /// MS-DOS and OS/2 (FAT / VFAT / FAT32 file systems) + MsDos, + /// Amiga + Amiga, + /// OpenVMS + OpenVms, + /// UNIX + Unix, + /// VM/CMS + VmCms, + /// Atari ST + AtariSt, + /// OS/2 H.P.F.S + Os2Hpfs, + /// Macintosh (see `Osx`) + Macintosh, + /// Z-System + ZSystem, + /// CP/M + CpM, + /// Windows NTFS + WindowsNtfs, + /// MVS (OS/390 - Z/OS) + Mvs, + /// VSE + Vse, + /// Acorn Risc + AcornRisc, + /// VFAT + Vfat, + /// alternate MVS + AlternateMvs, + /// BeOS + BeOs, + /// Tandem + Tandem, + /// OS/400 + Os400, + /// OS X (Darwin) + Osx, + /// Unknown host system + /// + /// Values 20 through 255 are currently unused, as of + /// APPNOTE.TXT 6.3.6 (April 26, 2019) + Unknown(u8), +} diff --git a/rc-zip/tests/integration_tests.rs b/rc-zip/tests/integration_tests.rs new file mode 100644 index 0000000..8078b1c --- /dev/null +++ b/rc-zip/tests/integration_tests.rs @@ -0,0 +1,48 @@ +use std::cmp; + +use rc_zip::{ + corpus, + fsm::{ArchiveFsm, FsmResult}, +}; + +#[test_log::test] +fn state_machine() { + let cases = corpus::test_cases(); + let case = cases.iter().find(|x| x.name == "zip64.zip").unwrap(); + let bs = std::fs::read(case.absolute_path()).unwrap(); + let mut fsm = ArchiveFsm::new(bs.len() as u64); + + let archive = 'read_zip: loop { + if let Some(offset) = fsm.wants_read() { + let increment = 128usize; + let offset = offset as usize; + let slice = if offset + increment > bs.len() { + &bs[offset..] 
+ } else { + &bs[offset..offset + increment] + }; + + let len = cmp::min(slice.len(), fsm.space().len()); + fsm.space()[..len].copy_from_slice(&slice[..len]); + match len { + 0 => panic!("EOF!"), + read_bytes => { + fsm.fill(read_bytes); + } + } + } + + fsm = match fsm.process() { + Ok(res) => match res { + FsmResult::Continue(fsm) => fsm, + FsmResult::Done(archive) => break 'read_zip archive, + }, + Err(err) => { + panic!("{}", err) + } + } + }; + + // cool, we have the archive + let _ = archive; +} diff --git a/src/lib.rs b/src/lib.rs deleted file mode 100644 index 9199643..0000000 --- a/src/lib.rs +++ /dev/null @@ -1,26 +0,0 @@ -//! # rc-zip -//! -//! rc-zip is a zip archive library with a focus on compatibility and correctness. -//! -//! ### Reading -//! -//! [ArchiveReader](reader::ArchiveReader) is your first stop. It -//! ensures we are dealing with a valid zip archive, and reads the central -//! directory. It does not perform I/O itself, but rather, it is a state machine -//! that asks for reads at specific offsets. -//! -//! An [Archive] contains a full list of [entries](StoredEntry), -//! which you can then extract. -//! -//! ### Writing -//! -//! Writing archives is not implemented yet. -//! - -mod encoding; -mod error; -mod format; -pub mod prelude; -pub mod reader; - -pub use self::{error::*, format::*}; diff --git a/src/prelude.rs b/src/prelude.rs deleted file mode 100644 index 21b88d1..0000000 --- a/src/prelude.rs +++ /dev/null @@ -1,4 +0,0 @@ -//! Prelude for rc-zip - -#[cfg(feature = "sync")] -pub use crate::reader::sync::{ReadZip, ReadZipWithSize}; diff --git a/src/reader/buffer.rs b/src/reader/buffer.rs deleted file mode 100644 index 7e2fda9..0000000 --- a/src/reader/buffer.rs +++ /dev/null @@ -1,72 +0,0 @@ -use std::io::Read; - -use tracing::trace; - -/// A wrapper around [oval::Buffer] that keeps track of how many bytes we've read since -/// initialization or the last reset. 
-pub(crate) struct Buffer { - pub(crate) buffer: oval::Buffer, - pub(crate) read_bytes: u64, -} - -impl Buffer { - /// creates a new buffer with the specified capacity - pub(crate) fn with_capacity(size: usize) -> Self { - Self { - buffer: oval::Buffer::with_capacity(size), - read_bytes: 0, - } - } - - /// resets the buffer (so that data() returns an empty slice, - /// and space() returns the full capacity), along with th e - /// read bytes counter. - pub(crate) fn reset(&mut self) { - self.read_bytes = 0; - self.buffer.reset(); - } - - /// returns the number of read bytes since the last reset - pub(crate) fn read_bytes(&self) -> u64 { - self.read_bytes - } - - /// returns a slice with all the available data - pub(crate) fn data(&self) -> &[u8] { - self.buffer.data() - } - - /// returns how much data can be read from the buffer - pub(crate) fn available_data(&self) -> usize { - self.buffer.available_data() - } - - /// advances the position tracker - /// - /// if the position gets past the buffer's half, - /// this will call `shift()` to move the remaining data - /// to the beginning of the buffer - pub(crate) fn consume(&mut self, count: usize) -> usize { - self.buffer.consume(count) - } - - /// fill that buffer from the given Read - pub(crate) fn read(&mut self, rd: &mut dyn Read) -> Result { - if self.buffer.available_space() == 0 { - trace!("uh oh, buffer has no available space!") - } - - match rd.read(self.buffer.space()) { - Ok(written) => { - self.read_bytes += written as u64; - self.buffer.fill(written); - Ok(written) - } - Err(e) => Err(e), - } - } - - pub(crate) fn read_offset(&self, offset: u64) -> u64 { - self.read_bytes + offset - } -} diff --git a/src/reader/macros.rs b/src/reader/macros.rs deleted file mode 100644 index 44f6394..0000000 --- a/src/reader/macros.rs +++ /dev/null @@ -1,10 +0,0 @@ -#[macro_export] -macro_rules! 
transition { - ($state: expr => ($pattern: pat) $body: expr) => { - $state = if let $pattern = std::mem::replace(&mut $state, S::Transitioning) { - $body - } else { - unreachable!() - }; - }; -} diff --git a/src/reader/mod.rs b/src/reader/mod.rs deleted file mode 100644 index c2def34..0000000 --- a/src/reader/mod.rs +++ /dev/null @@ -1,8 +0,0 @@ -mod buffer; -mod macros; - -mod archive_reader; -pub use self::archive_reader::{ArchiveReader, ArchiveReaderResult}; - -#[cfg(feature = "sync")] -pub mod sync; diff --git a/src/reader/sync/decoder.rs b/src/reader/sync/decoder.rs deleted file mode 100644 index fca6e56..0000000 --- a/src/reader/sync/decoder.rs +++ /dev/null @@ -1,100 +0,0 @@ -use std::{cmp, io}; - -use oval::Buffer; - -pub trait Decoder: io::Read -where - R: io::Read, -{ - /// Moves the inner reader out of this decoder. - /// self is boxed because decoders are typically used as trait objects. - fn into_inner(self: Box) -> R; - - /// Returns a mutable reference to the inner reader. - fn get_mut(&mut self) -> &mut R; -} - -pub struct StoreDecoder -where - R: io::Read, -{ - inner: R, -} - -impl StoreDecoder -where - R: io::Read, -{ - pub fn new(inner: R) -> Self { - Self { inner } - } -} - -impl io::Read for StoreDecoder -where - R: io::Read, -{ - fn read(&mut self, buf: &mut [u8]) -> io::Result { - self.inner.read(buf) - } -} - -impl Decoder for StoreDecoder -where - R: io::Read, -{ - fn into_inner(self: Box) -> R { - self.inner - } - - fn get_mut(&mut self) -> &mut R { - &mut self.inner - } -} - -/// Only allows reading a fixed number of bytes from a [oval::Buffer], -/// allowing to move the inner reader out afterwards. 
-pub struct RawEntryReader { - remaining: u64, - inner: Buffer, -} - -impl RawEntryReader { - pub fn new(inner: Buffer, remaining: u64) -> Self { - Self { inner, remaining } - } - - pub fn into_inner(self) -> Buffer { - self.inner - } - - pub fn get_mut(&mut self) -> &mut Buffer { - &mut self.inner - } -} - -impl io::BufRead for RawEntryReader { - fn fill_buf(&mut self) -> io::Result<&[u8]> { - let max_avail = cmp::min(self.remaining, self.inner.available_data() as u64); - Ok(&self.inner.data()[..max_avail as usize]) - } - - fn consume(&mut self, amt: usize) { - self.remaining -= amt as u64; - Buffer::consume(&mut self.inner, amt); - } -} - -impl io::Read for RawEntryReader { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - let len = cmp::min(buf.len() as u64, self.remaining) as usize; - tracing::trace!(%len, buf_len = buf.len(), remaining = self.remaining, available_data = self.inner.available_data(), available_space = self.inner.available_space(), "computing len"); - - let res = self.inner.read(&mut buf[..len]); - if let Ok(n) = res { - tracing::trace!(%n, "read ok"); - self.remaining -= n as u64; - } - res - } -} diff --git a/src/reader/sync/entry_reader/bzip2_dec.rs b/src/reader/sync/entry_reader/bzip2_dec.rs deleted file mode 100644 index 238427f..0000000 --- a/src/reader/sync/entry_reader/bzip2_dec.rs +++ /dev/null @@ -1,22 +0,0 @@ -use std::io::Read; - -use bzip2::read::BzDecoder; - -use crate::reader::sync::{Decoder, RawEntryReader}; - -impl Decoder for BzDecoder -where - R: Read, -{ - fn into_inner(self: Box) -> R { - Self::into_inner(*self) - } - - fn get_mut(&mut self) -> &mut R { - Self::get_mut(self) - } -} - -pub(crate) fn mk_decoder(r: RawEntryReader) -> impl Decoder { - BzDecoder::new(r) -} diff --git a/src/reader/sync/entry_reader/deflate64_dec.rs b/src/reader/sync/entry_reader/deflate64_dec.rs deleted file mode 100644 index cb1fdd0..0000000 --- a/src/reader/sync/entry_reader/deflate64_dec.rs +++ /dev/null @@ -1,22 +0,0 @@ -use 
std::io::{BufReader, Read}; - -use deflate64::Deflate64Decoder; - -use crate::reader::sync::{Decoder, RawEntryReader}; - -impl Decoder for Deflate64Decoder> -where - R: Read, -{ - fn into_inner(self: Box) -> R { - Self::into_inner(*self).into_inner() - } - - fn get_mut(&mut self) -> &mut R { - Self::get_mut(self).get_mut() - } -} - -pub(crate) fn mk_decoder(r: RawEntryReader) -> impl Decoder { - Deflate64Decoder::new(r) -} diff --git a/src/reader/sync/entry_reader/deflate_dec.rs b/src/reader/sync/entry_reader/deflate_dec.rs deleted file mode 100644 index dfa6495..0000000 --- a/src/reader/sync/entry_reader/deflate_dec.rs +++ /dev/null @@ -1,22 +0,0 @@ -use std::io::Read; - -use flate2::read::DeflateDecoder; - -use crate::reader::sync::{Decoder, RawEntryReader}; - -impl Decoder for DeflateDecoder -where - R: Read, -{ - fn into_inner(self: Box) -> R { - Self::into_inner(*self) - } - - fn get_mut(&mut self) -> &mut R { - Self::get_mut(self) - } -} - -pub(crate) fn mk_decoder(r: RawEntryReader) -> impl Decoder { - DeflateDecoder::new(r) -} diff --git a/src/reader/sync/entry_reader/lzma_dec.rs b/src/reader/sync/entry_reader/lzma_dec.rs deleted file mode 100644 index 1898ebf..0000000 --- a/src/reader/sync/entry_reader/lzma_dec.rs +++ /dev/null @@ -1,150 +0,0 @@ -use lzma_rs::decompress::Stream; -use std::io::{Read, Write}; - -use crate::{ - reader::sync::{Decoder, RawEntryReader}, - Error, UnsupportedError, -}; - -enum LzmaDecoderState { - Writing(Box>>), - Draining(Vec), - Transition, -} -struct LzmaDecoderAdapter { - input: R, - total_write_count: u64, - state: LzmaDecoderState, - read_buf: Vec, -} - -impl Read for LzmaDecoderAdapter -where - R: Read, -{ - fn read(&mut self, buf: &mut [u8]) -> std::io::Result { - let mut state = LzmaDecoderState::Transition; - std::mem::swap(&mut state, &mut self.state); - - match state { - LzmaDecoderState::Writing(mut stream) => { - let bytes_read = self.input.read(&mut self.read_buf)?; - if bytes_read == 0 { - // we're EOF: finish 
and move on to draining - self.state = LzmaDecoderState::Draining(stream.finish()?); - // and recurse - return self.read(buf); - } - - if let Err(e) = stream.write_all(&self.read_buf[..bytes_read]) { - if e.kind() == std::io::ErrorKind::WriteZero { - // that's expected actually! from the lzma-rs tests: - // - // A WriteZero error may occur if decompression is finished but there - // are remaining `compressed` bytes to write. - // This is the case when the unpacked size is encoded as unknown but - // provided when decoding. I.e. the 5 or 6 byte end-of-stream marker - // is not read. - - // finish and move on to draining - self.state = LzmaDecoderState::Draining(stream.finish()?); - // and recurse - return self.read(buf); - } else { - return Err(e); - } - } - - self.state = LzmaDecoderState::Writing(stream); - } - LzmaDecoderState::Draining(vec) => { - // nothing more to decode, we just need to empty our - // internal buffer - self.state = LzmaDecoderState::Draining(vec); - } - LzmaDecoderState::Transition => { - unreachable!() - } - }; - - let write_buf = match &mut self.state { - LzmaDecoderState::Writing(stream) => stream.get_output_mut().unwrap(), - LzmaDecoderState::Draining(vec) => vec, - LzmaDecoderState::Transition => unreachable!(), - }; - let write_count = std::cmp::min(buf.len(), write_buf.len()); - { - let src_slice = &write_buf[..write_count]; - let dst_slice = &mut buf[..write_count]; - dst_slice.copy_from_slice(src_slice); - } - - // copy the remaining bytes to the front of the buffer - write_buf.rotate_left(write_count); - write_buf.truncate(write_buf.len() - write_count); - - self.total_write_count += write_count as u64; - Ok(write_count) - } -} - -impl Decoder for LzmaDecoderAdapter -where - R: Read, -{ - fn into_inner(self: Box) -> R { - self.input - } - - fn get_mut(&mut self) -> &mut R { - &mut self.input - } -} - -pub(crate) fn mk_decoder( - mut r: RawEntryReader, - uncompressed_size: u64, -) -> std::io::Result> { - use byteorder::{LittleEndian, 
ReadBytesExt}; - - // see `appnote.txt` section 5.8 - - // major & minor version are each 1 byte - let major = r.read_u8()?; - let minor = r.read_u8()?; - - // properties size is a 2-byte little-endian integer - let properties_size = r.read_u16::()?; - - if (major, minor) != (2, 0) { - return Err( - Error::Unsupported(UnsupportedError::LzmaVersionUnsupported { minor, major }).into(), - ); - } - - const LZMA_PROPERTIES_SIZE: u16 = 5; - if properties_size != LZMA_PROPERTIES_SIZE { - return Err( - Error::Unsupported(UnsupportedError::LzmaPropertiesHeaderWrongSize { - expected: 5, - actual: properties_size, - }) - .into(), - ); - } - - let memlimit = 128 * 1024 * 1024; - let opts = lzma_rs::decompress::Options { - unpacked_size: lzma_rs::decompress::UnpackedSize::UseProvided(Some(uncompressed_size)), - allow_incomplete: false, - memlimit: Some(memlimit), - }; - - let stream = Stream::new_with_options(&opts, vec![]); - Ok(LzmaDecoderAdapter { - input: r, - total_write_count: 0, - state: LzmaDecoderState::Writing(Box::new(stream)), - read_buf: vec![0u8; 8192], - }) -} diff --git a/src/reader/sync/entry_reader/mod.rs b/src/reader/sync/entry_reader/mod.rs deleted file mode 100644 index a46e118..0000000 --- a/src/reader/sync/entry_reader/mod.rs +++ /dev/null @@ -1,327 +0,0 @@ -//! This part of the API is still being designed - no guarantees are made -//! whatsoever. 
-use crate::{ - error::*, - format::*, - reader::sync::decoder::{Decoder, RawEntryReader, StoreDecoder}, - transition, -}; - -#[cfg(feature = "lzma")] -mod lzma_dec; - -#[cfg(feature = "deflate")] -mod deflate_dec; - -#[cfg(feature = "deflate64")] -mod deflate64_dec; - -#[cfg(feature = "bzip2")] -mod bzip2_dec; - -#[cfg(feature = "zstd")] -mod zstd_dec; - -use cfg_if::cfg_if; -use oval::Buffer; -use std::io; -use tracing::trace; -use winnow::{ - error::ErrMode, - stream::{AsBytes, Offset}, - Parser, Partial, -}; - -struct EntryReadMetrics { - uncompressed_size: u64, - crc32: u32, -} - -enum State { - ReadLocalHeader { - buffer: Buffer, - }, - ReadData { - hasher: crc32fast::Hasher, - uncompressed_size: u64, - header: LocalFileHeaderRecord, - decoder: Box>, - }, - ReadDataDescriptor { - metrics: EntryReadMetrics, - header: LocalFileHeaderRecord, - buffer: Buffer, - }, - Validate { - metrics: EntryReadMetrics, - header: LocalFileHeaderRecord, - descriptor: Option, - }, - Done, - Transitioning, -} - -pub struct EntryReader -where - R: io::Read, -{ - rd: R, - eof: bool, - state: State, - inner: StoredEntryInner, - method: Method, -} - -impl io::Read for EntryReader -where - R: io::Read, -{ - fn read(&mut self, buf: &mut [u8]) -> io::Result { - use State as S; - match self.state { - S::ReadLocalHeader { ref mut buffer } => { - // FIXME: if this returns less than the size of LocalFileHeader, we'll error out - let read_bytes = self.rd.read(buffer.space())?; - buffer.fill(read_bytes); - - let mut input = Partial::new(buffer.data()); - match LocalFileHeaderRecord::parser.parse_next(&mut input) { - Ok(header) => { - buffer.consume(input.as_bytes().offset_from(&buffer.data())); - - trace!("local file header: {:#?}", header); - transition!(self.state => (S::ReadLocalHeader { buffer }) { - // allow unnecessary mut for some feature combinations - #[allow(unused_mut)] - let mut limited_reader = RawEntryReader::new(buffer, self.inner.compressed_size); - let decoder: Box> = 
self.get_decoder(limited_reader)?; - - S::ReadData { - hasher: crc32fast::Hasher::new(), - uncompressed_size: 0, - decoder, - header, - } - }); - self.read(buf) - } - Err(ErrMode::Incomplete(_)) => self.read(buf), - Err(_e) => Err(Error::Format(FormatError::InvalidLocalHeader).into()), - } - } - S::ReadData { - ref mut uncompressed_size, - ref mut decoder, - ref mut hasher, - .. - } => { - { - let buffer = decoder.get_mut().get_mut(); - if !self.eof && buffer.available_data() == 0 { - if buffer.available_space() == 0 { - buffer.shift(); - } - - match self.rd.read(buffer.space())? { - 0 => { - self.eof = true; - } - n => { - buffer.fill(n); - } - } - } - } - match decoder.read(buf) { - Ok(0) => { - transition!(self.state => (S::ReadData { decoder, header, hasher, uncompressed_size, .. }) { - let limited_reader = decoder.into_inner(); - let buffer = limited_reader.into_inner(); - let metrics = EntryReadMetrics { - crc32: hasher.finalize(), - uncompressed_size, - }; - if header.has_data_descriptor() { - trace!("will read data descriptor (flags = {:x})", header.flags); - S::ReadDataDescriptor { metrics, buffer, header } - } else { - trace!("no data descriptor to read"); - S::Validate { metrics, header, descriptor: None } - } - }); - self.read(buf) - } - Ok(n) => { - *uncompressed_size += n as u64; - hasher.update(&buf[..n]); - Ok(n) - } - Err(e) => match e.kind() { - io::ErrorKind::UnexpectedEof => { - let buffer = decoder.get_mut().get_mut(); - if self.eof || buffer.available_space() == 0 { - Err(e) - } else { - self.read(buf) - } - } - _ => Err(e), - }, - } - } - S::ReadDataDescriptor { ref mut buffer, .. 
} => { - trace!( - "read data descriptor, avail data = {}, avail space = {}", - buffer.available_data(), - buffer.available_space() - ); - - let mut input = Partial::new(buffer.data()); - match DataDescriptorRecord::mk_parser(self.inner.is_zip64).parse_next(&mut input) { - Ok(descriptor) => { - buffer.consume(input.as_bytes().offset_from(&buffer.data())); - trace!("data descriptor = {:#?}", descriptor); - transition!(self.state => (S::ReadDataDescriptor { metrics, header, .. }) { - S::Validate { metrics, header, descriptor: Some(descriptor) } - }); - self.read(buf) - } - Err(ErrMode::Incomplete(_)) => { - let n = self.rd.read(buffer.space())?; - if n == 0 { - return Err(io::ErrorKind::UnexpectedEof.into()); - } - buffer.fill(n); - trace!("filled {}", n); - - self.read(buf) - } - Err(_e) => Err(Error::Format(FormatError::InvalidLocalHeader).into()), - } - } - S::Validate { - ref metrics, - ref header, - ref descriptor, - } => { - let expected_crc32 = if self.inner.crc32 != 0 { - self.inner.crc32 - } else if let Some(descriptor) = descriptor.as_ref() { - descriptor.crc32 - } else { - header.crc32 - }; - - let expected_size = if self.inner.uncompressed_size != 0 { - self.inner.uncompressed_size - } else if let Some(descriptor) = descriptor.as_ref() { - descriptor.uncompressed_size - } else { - header.uncompressed_size as u64 - }; - - if expected_size != metrics.uncompressed_size { - return Err(Error::Format(FormatError::WrongSize { - expected: expected_size, - actual: metrics.uncompressed_size, - }) - .into()); - } - - if expected_crc32 != 0 && expected_crc32 != metrics.crc32 { - return Err(Error::Format(FormatError::WrongChecksum { - expected: expected_crc32, - actual: metrics.crc32, - }) - .into()); - } - - self.state = S::Done; - self.read(buf) - } - S::Done => Ok(0), - S::Transitioning => unreachable!(), - } - } -} - -impl EntryReader -where - R: io::Read, -{ - const DEFAULT_BUFFER_SIZE: usize = 256 * 1024; - - pub fn new(entry: &StoredEntry, get_reader: F) -> 
Self - where - F: Fn(u64) -> R, - { - Self { - rd: get_reader(entry.header_offset), - eof: false, - state: State::ReadLocalHeader { - buffer: Buffer::with_capacity(Self::DEFAULT_BUFFER_SIZE), - }, - method: entry.method(), - inner: entry.inner, - } - } - - fn get_decoder( - &self, - #[allow(unused_mut)] mut raw_r: RawEntryReader, - ) -> Result>, Error> { - let decoder: Box> = match self.method { - Method::Store => Box::new(StoreDecoder::new(raw_r)), - Method::Deflate => { - cfg_if! { - if #[cfg(feature = "deflate")] { - Box::new(deflate_dec::mk_decoder(raw_r)) - } else { - return Err(Error::method_not_enabled(self.method)); - } - } - } - Method::Deflate64 => { - cfg_if! { - if #[cfg(feature = "deflate64")] { - Box::new(deflate64_dec::mk_decoder(raw_r)) - } else { - return Err(Error::method_not_enabled(self.method)); - } - } - } - Method::Lzma => { - cfg_if! { - if #[cfg(feature = "lzma")] { - Box::new(lzma_dec::mk_decoder(raw_r,self.inner.uncompressed_size)?) - } else { - return Err(Error::method_not_enabled(self.method)); - } - } - } - Method::Bzip2 => { - cfg_if! { - if #[cfg(feature = "bzip2")] { - Box::new(bzip2_dec::mk_decoder(raw_r)) - } else { - return Err(Error::method_not_enabled(self.method)); - } - } - } - Method::Zstd => { - cfg_if! { - if #[cfg(feature = "zstd")] { - Box::new(zstd_dec::mk_decoder(raw_r)?) 
- } else { - return Err(Error::method_not_enabled(self.method)); - } - } - } - method => { - return Err(Error::method_not_supported(method)); - } - }; - - Ok(decoder) - } -} diff --git a/src/reader/sync/entry_reader/zstd_dec.rs b/src/reader/sync/entry_reader/zstd_dec.rs deleted file mode 100644 index 014f672..0000000 --- a/src/reader/sync/entry_reader/zstd_dec.rs +++ /dev/null @@ -1,22 +0,0 @@ -use std::io::{BufRead, Read}; - -use zstd::stream::Decoder as ZstdDecoder; - -use crate::reader::sync::{Decoder, RawEntryReader}; - -impl Decoder for ZstdDecoder<'static, R> -where - R: Read + BufRead, -{ - fn into_inner(self: Box) -> R { - Self::finish(*self) - } - - fn get_mut(&mut self) -> &mut R { - Self::get_mut(self) - } -} - -pub(crate) fn mk_decoder(r: RawEntryReader) -> std::io::Result> { - ZstdDecoder::with_buffer(r) -} diff --git a/src/reader/sync/mod.rs b/src/reader/sync/mod.rs deleted file mode 100644 index a0708db..0000000 --- a/src/reader/sync/mod.rs +++ /dev/null @@ -1,8 +0,0 @@ -mod decoder; -pub use decoder::*; - -mod entry_reader; -pub use entry_reader::*; - -mod read_zip; -pub use read_zip::*; diff --git a/tests/data/cp-437.zip b/testdata/cp-437.zip similarity index 100% rename from tests/data/cp-437.zip rename to testdata/cp-437.zip diff --git a/tests/data/crc32-not-streamed.zip b/testdata/crc32-not-streamed.zip similarity index 100% rename from tests/data/crc32-not-streamed.zip rename to testdata/crc32-not-streamed.zip diff --git a/tests/data/dd.zip b/testdata/dd.zip similarity index 100% rename from tests/data/dd.zip rename to testdata/dd.zip diff --git a/tests/data/found-me-bzip2.zip b/testdata/found-me-bzip2.zip similarity index 100% rename from tests/data/found-me-bzip2.zip rename to testdata/found-me-bzip2.zip diff --git a/tests/data/found-me-deflate64.zip b/testdata/found-me-deflate64.zip similarity index 100% rename from tests/data/found-me-deflate64.zip rename to testdata/found-me-deflate64.zip diff --git a/tests/data/found-me-lzma.zip 
b/testdata/found-me-lzma.zip similarity index 100% rename from tests/data/found-me-lzma.zip rename to testdata/found-me-lzma.zip diff --git a/tests/data/found-me-zstd.zip b/testdata/found-me-zstd.zip similarity index 100% rename from tests/data/found-me-zstd.zip rename to testdata/found-me-zstd.zip diff --git a/tests/data/found-me.txt b/testdata/found-me.txt similarity index 100% rename from tests/data/found-me.txt rename to testdata/found-me.txt diff --git a/tests/data/go-no-datadesc-sig.zip b/testdata/go-no-datadesc-sig.zip similarity index 100% rename from tests/data/go-no-datadesc-sig.zip rename to testdata/go-no-datadesc-sig.zip diff --git a/tests/data/go-with-datadesc-sig.zip b/testdata/go-with-datadesc-sig.zip similarity index 100% rename from tests/data/go-with-datadesc-sig.zip rename to testdata/go-with-datadesc-sig.zip diff --git a/tests/data/gophercolor16x16.png b/testdata/gophercolor16x16.png similarity index 100% rename from tests/data/gophercolor16x16.png rename to testdata/gophercolor16x16.png diff --git a/tests/data/readme.notzip b/testdata/readme.notzip similarity index 100% rename from tests/data/readme.notzip rename to testdata/readme.notzip diff --git a/tests/data/readme.trailingzip b/testdata/readme.trailingzip similarity index 100% rename from tests/data/readme.trailingzip rename to testdata/readme.trailingzip diff --git a/tests/data/readme.zip b/testdata/readme.zip similarity index 100% rename from tests/data/readme.zip rename to testdata/readme.zip diff --git a/tests/data/shift-jis.zip b/testdata/shift-jis.zip similarity index 100% rename from tests/data/shift-jis.zip rename to testdata/shift-jis.zip diff --git a/tests/data/symlink.zip b/testdata/symlink.zip similarity index 100% rename from tests/data/symlink.zip rename to testdata/symlink.zip diff --git a/tests/data/test-trailing-junk.zip b/testdata/test-trailing-junk.zip similarity index 100% rename from tests/data/test-trailing-junk.zip rename to testdata/test-trailing-junk.zip diff 
--git a/tests/data/test.zip b/testdata/test.zip similarity index 100% rename from tests/data/test.zip rename to testdata/test.zip diff --git a/tests/data/time-22738.zip b/testdata/time-22738.zip similarity index 100% rename from tests/data/time-22738.zip rename to testdata/time-22738.zip diff --git a/tests/data/time-7zip.zip b/testdata/time-7zip.zip similarity index 100% rename from tests/data/time-7zip.zip rename to testdata/time-7zip.zip diff --git a/tests/data/time-go.zip b/testdata/time-go.zip similarity index 100% rename from tests/data/time-go.zip rename to testdata/time-go.zip diff --git a/tests/data/time-infozip.zip b/testdata/time-infozip.zip similarity index 100% rename from tests/data/time-infozip.zip rename to testdata/time-infozip.zip diff --git a/tests/data/time-osx.zip b/testdata/time-osx.zip similarity index 100% rename from tests/data/time-osx.zip rename to testdata/time-osx.zip diff --git a/tests/data/time-win7.zip b/testdata/time-win7.zip similarity index 100% rename from tests/data/time-win7.zip rename to testdata/time-win7.zip diff --git a/tests/data/time-winrar.zip b/testdata/time-winrar.zip similarity index 100% rename from tests/data/time-winrar.zip rename to testdata/time-winrar.zip diff --git a/tests/data/time-winzip.zip b/testdata/time-winzip.zip similarity index 100% rename from tests/data/time-winzip.zip rename to testdata/time-winzip.zip diff --git a/tests/data/unix.zip b/testdata/unix.zip similarity index 100% rename from tests/data/unix.zip rename to testdata/unix.zip diff --git a/tests/data/utf8-7zip.zip b/testdata/utf8-7zip.zip similarity index 100% rename from tests/data/utf8-7zip.zip rename to testdata/utf8-7zip.zip diff --git a/tests/data/utf8-infozip.zip b/testdata/utf8-infozip.zip similarity index 100% rename from tests/data/utf8-infozip.zip rename to testdata/utf8-infozip.zip diff --git a/tests/data/utf8-osx.zip b/testdata/utf8-osx.zip similarity index 100% rename from tests/data/utf8-osx.zip rename to testdata/utf8-osx.zip 
diff --git a/tests/data/utf8-winrar.zip b/testdata/utf8-winrar.zip similarity index 100% rename from tests/data/utf8-winrar.zip rename to testdata/utf8-winrar.zip diff --git a/tests/data/utf8-winzip.zip b/testdata/utf8-winzip.zip similarity index 100% rename from tests/data/utf8-winzip.zip rename to testdata/utf8-winzip.zip diff --git a/tests/data/winxp.zip b/testdata/winxp.zip similarity index 100% rename from tests/data/winxp.zip rename to testdata/winxp.zip diff --git a/tests/data/zip64-2.zip b/testdata/zip64-2.zip similarity index 100% rename from tests/data/zip64-2.zip rename to testdata/zip64-2.zip diff --git a/tests/data/zip64.zip b/testdata/zip64.zip similarity index 100% rename from tests/data/zip64.zip rename to testdata/zip64.zip diff --git a/tests/integration_tests.rs b/tests/integration_tests.rs deleted file mode 100644 index b7741b7..0000000 --- a/tests/integration_tests.rs +++ /dev/null @@ -1,397 +0,0 @@ -use chrono::{ - offset::{FixedOffset, Utc}, - DateTime, TimeZone, Timelike, -}; -use rc_zip::{ - prelude::*, - reader::sync::{HasCursor, SyncArchive, SyncStoredEntry}, - Archive, Encoding, -}; -use std::{fs::File, path::PathBuf}; - -enum ZipSource { - File(&'static str), - Func(&'static str, Box Vec>), -} - -struct ZipTest { - source: ZipSource, - expected_encoding: Option, - comment: Option<&'static str>, - files: Vec, - error: Option, -} - -impl Default for ZipTest { - fn default() -> Self { - Self { - source: ZipSource::Func("default.zip", Box::new(|| unreachable!())), - expected_encoding: None, - comment: None, - files: vec![], - error: None, - } - } -} - -impl ZipTest { - fn check(&self, archive: Result, rc_zip::Error>) { - let case_bytes = self.bytes(); - - if let Some(expected) = &self.error { - let actual = match archive { - Err(e) => e, - Ok(_) => panic!("should have failed"), - }; - let expected = format!("{:#?}", expected); - let actual = format!("{:#?}", actual); - assert_eq!(expected, actual); - return; - } - let archive = 
archive.unwrap(); - - assert_eq!(case_bytes.len() as u64, archive.size()); - - if let Some(expected) = self.comment { - assert_eq!(expected, archive.comment().expect("should have comment")) - } - - if let Some(exp_encoding) = self.expected_encoding { - println!("{}: should be {}", self.name(), exp_encoding); - assert_eq!(archive.encoding(), exp_encoding); - } - - assert_eq!( - self.files.len(), - archive.entries().count(), - "{} should have {} entries files", - self.name(), - self.files.len() - ); - - for f in &self.files { - f.check(&archive); - } - } -} - -struct ZipTestFile { - name: &'static str, - mode: Option, - modified: Option>, - content: FileContent, -} - -impl ZipTestFile { - fn check(&self, archive: &SyncArchive<'_, F>) { - let entry = archive - .by_name(self.name) - .unwrap_or_else(|| panic!("entry {} should exist", self.name)); - - let archive_inner: &Archive = archive; - let entry_inner = archive_inner.by_name(self.name).unwrap(); - assert_eq!(entry.name(), entry_inner.name()); - - self.check_against(entry); - } - - fn check_against(&self, entry: SyncStoredEntry<'_, F>) { - if let Some(expected) = self.modified { - assert_eq!( - expected, - entry.modified(), - "entry {} should have modified = {:?}", - entry.name(), - expected - ) - } - - if let Some(mode) = self.mode { - assert_eq!(entry.mode.0 & 0o777, mode); - } - - // I have honestly yet to see a zip file _entry_ with a comment. 
- assert!(entry.comment().is_none()); - - match entry.contents() { - rc_zip::EntryContents::File => { - let actual_bytes = entry.bytes().unwrap(); - - match &self.content { - FileContent::Unchecked => { - // ah well - } - FileContent::Bytes(expected_bytes) => { - // first check length - assert_eq!(actual_bytes.len(), expected_bytes.len()); - assert_eq!(&actual_bytes[..], &expected_bytes[..]) - } - FileContent::File(file_path) => { - let expected_bytes = std::fs::read(zips_dir().join(file_path)).unwrap(); - // first check length - assert_eq!(actual_bytes.len(), expected_bytes.len()); - assert_eq!(&actual_bytes[..], &expected_bytes[..]) - } - } - } - rc_zip::EntryContents::Symlink | rc_zip::EntryContents::Directory => { - assert!(matches!(self.content, FileContent::Unchecked)); - } - } - } -} - -enum FileContent { - Unchecked, - Bytes(Vec), - File(&'static str), -} - -impl Default for ZipTestFile { - fn default() -> Self { - Self { - name: "default", - mode: None, - modified: None, - content: FileContent::Unchecked, - } - } -} - -impl ZipTest { - fn name(&self) -> &'static str { - match &self.source { - ZipSource::File(name) => name, - ZipSource::Func(name, _f) => name, - } - } - - // Read source archive from disk - fn bytes(&self) -> Vec { - match &self.source { - ZipSource::File(name) => std::fs::read(zips_dir().join(name)).unwrap(), - ZipSource::Func(_name, f) => f(), - } - } -} - -fn zips_dir() -> PathBuf { - PathBuf::from(env!("CARGO_MANIFEST_DIR")) - .join("tests") - .join("data") -} - -fn time_zone(hours: i32) -> FixedOffset { - FixedOffset::east_opt(hours * 3600).unwrap() -} - -fn date( - (year, month, day): (i32, u32, u32), - (hour, min, sec): (u32, u32, u32), - nsec: u32, - offset: FixedOffset, -) -> Option> { - Some( - offset - .with_ymd_and_hms(year, month, day, hour, min, sec) - .single()? - .with_nanosecond(nsec)? 
- .into(), - ) -} - -fn test_cases() -> Vec { - vec![ - ZipTest { - source: ZipSource::File("zip64.zip"), - files: vec![ZipTestFile { - name: "README", - content: FileContent::Bytes( - "This small file is in ZIP64 format.\n".as_bytes().into(), - ), - modified: Some(date((2012, 8, 10), (14, 33, 32), 0, time_zone(0)).unwrap()), - mode: Some(0o644), - }], - ..Default::default() - }, - ZipTest { - source: ZipSource::File("test.zip"), - comment: Some("This is a zipfile comment."), - expected_encoding: Some(Encoding::Utf8), - files: vec![ - ZipTestFile { - name: "test.txt", - content: FileContent::Bytes("This is a test text file.\n".as_bytes().into()), - modified: Some(date((2010, 9, 5), (12, 12, 1), 0, time_zone(10)).unwrap()), - mode: Some(0o644), - }, - ZipTestFile { - name: "gophercolor16x16.png", - content: FileContent::File("gophercolor16x16.png"), - modified: Some(date((2010, 9, 5), (15, 52, 58), 0, time_zone(10)).unwrap()), - mode: Some(0o644), - }, - ], - ..Default::default() - }, - ZipTest { - source: ZipSource::File("cp-437.zip"), - expected_encoding: Some(Encoding::Cp437), - files: vec![ZipTestFile { - name: "français", - ..Default::default() - }], - ..Default::default() - }, - ZipTest { - source: ZipSource::File("shift-jis.zip"), - expected_encoding: Some(Encoding::ShiftJis), - files: vec![ - ZipTestFile { - name: "should-be-jis/", - ..Default::default() - }, - ZipTestFile { - name: "should-be-jis/ot_運命のワルツネぞなぞ小さな楽しみ遊びま.longboi", - ..Default::default() - }, - ], - ..Default::default() - }, - ZipTest { - source: ZipSource::File("utf8-winrar.zip"), - expected_encoding: Some(Encoding::Utf8), - files: vec![ZipTestFile { - name: "世界", - content: FileContent::Bytes(vec![]), - modified: Some(date((2017, 11, 6), (21, 9, 27), 867862500, time_zone(0)).unwrap()), - ..Default::default() - }], - ..Default::default() - }, - #[cfg(feature = "lzma")] - ZipTest { - source: ZipSource::File("found-me-lzma.zip"), - expected_encoding: Some(Encoding::Utf8), - files: 
vec![ZipTestFile { - name: "found-me.txt", - content: FileContent::Bytes("Oh no, you found me\n".repeat(5000).into()), - modified: Some(date((2024, 1, 26), (16, 14, 35), 46003100, time_zone(0)).unwrap()), - ..Default::default() - }], - ..Default::default() - }, - #[cfg(feature = "deflate64")] - ZipTest { - source: ZipSource::File("found-me-deflate64.zip"), - expected_encoding: Some(Encoding::Utf8), - files: vec![ZipTestFile { - name: "found-me.txt", - content: FileContent::Bytes("Oh no, you found me\n".repeat(5000).into()), - modified: Some(date((2024, 1, 26), (16, 14, 35), 46003100, time_zone(0)).unwrap()), - ..Default::default() - }], - ..Default::default() - }, - // same with bzip2 - #[cfg(feature = "bzip2")] - ZipTest { - source: ZipSource::File("found-me-bzip2.zip"), - expected_encoding: Some(Encoding::Utf8), - files: vec![ZipTestFile { - name: "found-me.txt", - content: FileContent::Bytes("Oh no, you found me\n".repeat(5000).into()), - modified: Some(date((2024, 1, 26), (16, 14, 35), 46003100, time_zone(0)).unwrap()), - ..Default::default() - }], - ..Default::default() - }, - // same with zstd - #[cfg(feature = "zstd")] - ZipTest { - source: ZipSource::File("found-me-zstd.zip"), - expected_encoding: Some(Encoding::Utf8), - files: vec![ZipTestFile { - name: "found-me.txt", - content: FileContent::Bytes("Oh no, you found me\n".repeat(5000).into()), - modified: Some(date((2024, 1, 31), (6, 10, 25), 800491400, time_zone(0)).unwrap()), - ..Default::default() - }], - ..Default::default() - }, - ] -} - -#[test_log::test] -fn read_from_slice() { - let bytes = std::fs::read(zips_dir().join("test.zip")).unwrap(); - let slice = &bytes[..]; - let archive = slice.read_zip().unwrap(); - assert_eq!(archive.entries().count(), 2); -} - -#[test_log::test] -fn read_from_file() { - let f = File::open(zips_dir().join("test.zip")).unwrap(); - let archive = f.read_zip().unwrap(); - assert_eq!(archive.entries().count(), 2); -} - -#[test_log::test] -fn real_world_files() { - for case 
in test_cases() { - tracing::trace!("============ testing {}", case.name()); - case.check(case.bytes().read_zip()); - } -} - -#[test_log::test] -fn state_machine() { - use rc_zip::reader::{ArchiveReader, ArchiveReaderResult}; - - let cases = test_cases(); - let case = cases.iter().find(|x| x.name() == "zip64.zip").unwrap(); - let bs = case.bytes(); - let mut zar = ArchiveReader::new(bs.len() as u64); - - let archive = 'read_zip: loop { - if let Some(offset) = zar.wants_read() { - let increment = 128usize; - let offset = offset as usize; - let mut slice = if offset + increment > bs.len() { - &bs[offset..] - } else { - &bs[offset..offset + increment] - }; - - match zar.read(&mut slice) { - Ok(0) => panic!("EOF!"), - Ok(read_bytes) => { - println!("at {}, zar read {} bytes", offset, read_bytes); - } - Err(err) => { - println!("at {}, zar encountered an error:", offset); - panic!("{}", err) - } - } - } - - match zar.process() { - Ok(res) => match res { - ArchiveReaderResult::Continue => {} - ArchiveReaderResult::Done(archive) => break 'read_zip archive, - }, - Err(err) => { - println!("zar processing error: {:#?}", err); - panic!("{}", err) - } - } - }; - - let sync_archive = bs.read_zip().unwrap(); - for (se, e) in sync_archive.entries().zip(archive.entries()) { - assert_eq!(se.name(), e.name()); - assert_eq!(se.inner.compressed_size, e.inner.compressed_size); - assert_eq!(se.inner.uncompressed_size, e.inner.uncompressed_size); - } -}