From 8039e2c7e9c60bc24291bb0a04ffe5c482c1e9a6 Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 26 Jan 2024 18:10:05 +0100 Subject: [PATCH 1/5] wip: Add lzma support --- .vscode/settings.json | 3 ++ Cargo.lock | 28 +++++++++++++++++++ crates/jean/Cargo.toml | 5 ++++ crates/rc-zip/Cargo.toml | 3 ++ crates/rc-zip/src/reader/sync/decoder.rs | 17 +++++++++++ crates/rc-zip/src/reader/sync/entry_reader.rs | 28 ++++++++++++++++++- crates/rc-zip/src/tests.rs | 12 ++++++++ 7 files changed, 95 insertions(+), 1 deletion(-) create mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..4ac34f4 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "rust-analyzer.cargo.features": ["default", "lzma"] +} \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 53136bc..4cca98b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -367,6 +367,17 @@ version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" +[[package]] +name = "lzma-sys" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "memchr" version = "2.7.1" @@ -512,6 +523,12 @@ version = "0.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58" +[[package]] +name = "pkg-config" +version = "0.3.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2900ede94e305130c13ddd391e0ab7cbaeb783945ae07a279c268cb05109c6cb" + [[package]] name = "portable-atomic" version = "1.6.0" @@ -581,6 +598,7 @@ checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" name = "rc-zip" version = "2.0.1" dependencies = [ + "byteorder", "chardetng", "chrono", "circular", @@ -594,6 +612,7 @@ dependencies = [ "pretty-hex", "thiserror", "tracing", + "xz2", ] [[package]] @@ -958,3 +977,12 @@ checksum = "b7cf47b659b318dccbd69cc4797a39ae128f533dce7902a1096044d1967b9c16" dependencies = [ "memchr", ] + +[[package]] +name = "xz2" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" +dependencies = [ + "lzma-sys", +] diff --git a/crates/jean/Cargo.toml b/crates/jean/Cargo.toml index e56f0a7..77f8643 100644 --- a/crates/jean/Cargo.toml +++ b/crates/jean/Cargo.toml @@ -14,3 +14,8 @@ humansize = "2.1.3" positioned-io.workspace = true indicatif = "0.17.7" tracing-subscriber = "0.3.18" + +[features] +default = ["lzma"] +deflate = ["rc-zip/deflate"] +lzma = ["rc-zip/lzma"] diff --git a/crates/rc-zip/Cargo.toml b/crates/rc-zip/Cargo.toml index 5229842..9c49809 100644 --- a/crates/rc-zip/Cargo.toml +++ b/crates/rc-zip/Cargo.toml @@ -25,9 +25,12 @@ thiserror = "1.0.56" chardetng = "0.1.17" flate2 = { version = "1.0.28", optional = true } num_enum = "0.7.2" +xz2 = { version = "0.1.7", optional = true } +byteorder = "1.5.0" [features] default = ["sync", "file", "deflate"] sync = [] file = ["positioned-io"] deflate = ["flate2"] +lzma = ["xz2"] diff --git a/crates/rc-zip/src/reader/sync/decoder.rs b/crates/rc-zip/src/reader/sync/decoder.rs index 058e503..479d129 100644 --- a/crates/rc-zip/src/reader/sync/decoder.rs +++ b/crates/rc-zip/src/reader/sync/decoder.rs @@ -1,6 +1,9 @@ #[cfg(feature = "deflate")] use flate2::read::DeflateDecoder; +#[cfg(feature = "lzma")] +use xz2::read::XzDecoder; + use std::{cmp, io}; pub trait Decoder: io::Read @@ -29,6 +32,20 @@ where } } +#[cfg(feature = "lzma")] +impl Decoder for XzDecoder +where + R: io::Read, +{ + fn into_inner(self: Box) -> R { + XzDecoder::into_inner(*self) + } + + fn get_mut(&mut self) -> &mut R { + XzDecoder::get_mut(self) + } +} + pub struct StoreDecoder where R: io::Read, diff --git a/crates/rc-zip/src/reader/sync/entry_reader.rs b/crates/rc-zip/src/reader/sync/entry_reader.rs index f82e873..0ef3067 100644 --- a/crates/rc-zip/src/reader/sync/entry_reader.rs +++ b/crates/rc-zip/src/reader/sync/entry_reader.rs @@ -72,7 +72,10 @@ where trace!("local file header: {:#?}", header); transition!(self.state => (S::ReadLocalHeader { buffer }) { - let limited_reader = LimitedReader::new(buffer, self.inner.compressed_size); + // allow unnecessary mut for some feature combinations + #[allow(unused_mut)] + let mut limited_reader = LimitedReader::new(buffer, self.inner.compressed_size); + let decoder: Box> = match self.method { Method::Store => Box::new(StoreDecoder::new(limited_reader)), Method::Deflate => { @@ -82,6 +85,29 @@ where #[cfg(not(feature = "deflate"))] { return Err(Error::Unsupported(UnsupportedError::CompressionMethodNotEnabled(Method::Deflate)).into()) } }, + Method::Lzma => { + #[cfg(feature = "lzma")] + { + // TODO: use a parser combinator library for this probably + + // read LZMA properties header first. + use byteorder::{LittleEndian, ReadBytesExt}; + let major: u8 = limited_reader.read_u8()?; + let minor: u8 = limited_reader.read_u8()?; + + let size: u16 = limited_reader.read_u16::()?; + // this is an u16, worse case scenario is 65536 bytes + let mut data = [0u8; 1 << 16]; + limited_reader.read_exact(&mut data[..size as usize])?; + let data = &data[..size as usize]; + trace!(%major, %minor, %size, "LZMA properties header, data = {data:02x?}"); + + Box::new(xz2::read::XzDecoder::new_stream(limited_reader, xz2::stream::Stream::new_lzma_decoder(128 * 1024 * 1024)?)) + } + + #[cfg(not(feature = "lzma"))] + { return Err(Error::Unsupported(UnsupportedError::CompressionMethodNotEnabled(Method::Lzma)).into()) } + } method => return Err(Error::Unsupported(UnsupportedError::UnsupportedCompressionMethod(method)).into()), }; diff --git a/crates/rc-zip/src/tests.rs b/crates/rc-zip/src/tests.rs index 2684521..e3bef67 100644 --- a/crates/rc-zip/src/tests.rs +++ b/crates/rc-zip/src/tests.rs @@ -266,6 +266,18 @@ fn test_cases() -> Vec { }], ..Default::default() }, + #[cfg(feature = "lzma")] + ZipTest { + source: ZipSource::File("found-me-lzma.zip"), + expected_encoding: Some(Encoding::Utf8), + files: vec![ZipTestFile { + name: "found-me.txt", + content: FileContent::Bytes("Oh no, you found me!\n".repeat(5000).into()), + modified: Some(date((2024, 1, 26), (17, 14, 36), 0, time_zone(0)).unwrap()), + ..Default::default() + }], + ..Default::default() + }, ] } From 5f9fb74bb4a27e30848d2f604d4db7e7dd1228a2 Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 26 Jan 2024 18:15:29 +0100 Subject: [PATCH 2/5] Add .direnv --- .envrc | 1 + Cargo.lock | 38 +++++++------- crates/rc-zip/Cargo.toml | 4 +- flake.lock | 106 +++++++++++++++++++++++++++++++++++++++ flake.nix | 76 ++++++++++++++++++++++++++++ 5 files changed, 202 insertions(+), 23 deletions(-) create mode 100644 .envrc create mode 100644 flake.lock create mode 100644 flake.nix diff --git a/.envrc b/.envrc new file mode 100644 index 0000000..3550a30 --- /dev/null +++ b/.envrc @@ -0,0 +1 @@ +use flake diff --git a/Cargo.lock b/Cargo.lock index 4cca98b..f64bdac 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -367,17 +367,6 @@ version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" -[[package]] -name = "lzma-sys" -version = "0.1.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" -dependencies = [ - "cc", - "libc", - "pkg-config", -] - [[package]] name = "memchr" version = "2.7.1" @@ -610,9 +599,19 @@ dependencies = [ "oem_cp", "positioned-io", "pretty-hex", + "rust-lzma", "thiserror", "tracing", - "xz2", +] + +[[package]] +name = "rust-lzma" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d62915608f6cee1d7f2fc00f28b4f058ff79d6e4ec3c2fe0006b09b52437c84" +dependencies = [ + "pkg-config", + "vcpkg", ] [[package]] @@ -818,6 +817,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "wasm-bindgen" version = "0.2.90" @@ -977,12 +982,3 @@ checksum = "b7cf47b659b318dccbd69cc4797a39ae128f533dce7902a1096044d1967b9c16" dependencies = [ "memchr", ] - -[[package]] -name = "xz2" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" -dependencies = [ - "lzma-sys", -] diff --git a/crates/rc-zip/Cargo.toml b/crates/rc-zip/Cargo.toml index 9c49809..e2ba977 100644 --- a/crates/rc-zip/Cargo.toml +++ b/crates/rc-zip/Cargo.toml @@ -25,12 +25,12 @@ thiserror = "1.0.56" chardetng = "0.1.17" flate2 = { version = "1.0.28", optional = true } num_enum = "0.7.2" -xz2 = { version = "0.1.7", optional = true } byteorder = "1.5.0" +rust-lzma = { version = "0.6.0", optional = true } [features] default = ["sync", "file", "deflate"] sync = [] file = ["positioned-io"] deflate = ["flate2"] -lzma = ["xz2"] +lzma = ["rust-lzma"] diff --git a/flake.lock b/flake.lock new file mode 100644 index 0000000..46b24b8 --- /dev/null +++ b/flake.lock @@ -0,0 +1,106 @@ +{ + "nodes": { + "crane": { + "inputs": { + "nixpkgs": [ + "nixpkgs" + ] + }, + "locked": { + "lastModified": 1705974079, + "narHash": "sha256-HyC3C2esW57j6bG0MKwX4kQi25ltslRnr6z2uvpadJo=", + "owner": "ipetkov", + "repo": "crane", + "rev": "0b4e511fe6e346381e31d355e03de52aa43e8cb2", + "type": "github" + }, + "original": { + "owner": "ipetkov", + "repo": "crane", + "type": "github" + } + }, + "flake-utils": { + "inputs": { + "systems": "systems" + }, + "locked": { + "lastModified": 1705309234, + "narHash": "sha256-uNRRNRKmJyCRC/8y1RqBkqWBLM034y4qN7EprSdmgyA=", + "owner": "numtide", + "repo": "flake-utils", + "rev": "1ef2e671c3b0c19053962c07dbda38332dcebf26", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "flake-utils", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1705856552, + "narHash": "sha256-JXfnuEf5Yd6bhMs/uvM67/joxYKoysyE3M2k6T3eWbg=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "612f97239e2cc474c13c9dafa0df378058c5ad8d", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixos-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "crane": "crane", + "flake-utils": "flake-utils", + "nixpkgs": "nixpkgs", + "rust-overlay": "rust-overlay" + } + }, + "rust-overlay": { + "inputs": { + "flake-utils": [ + "flake-utils" + ], + "nixpkgs": [ + "nixpkgs" + ] + }, + "locked": { + "lastModified": 1706235145, + "narHash": "sha256-3jh5nahTlcsX6QFcMPqxtLn9p9CgT9RSce5GLqjcpi4=", + "owner": "oxalica", + "repo": "rust-overlay", + "rev": "3a57c4e29cb2beb777b2e6ae7309a680585b8b2f", + "type": "github" + }, + "original": { + "owner": "oxalica", + "repo": "rust-overlay", + "type": "github" + } + }, + "systems": { + "locked": { + "lastModified": 1681028828, + "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "owner": "nix-systems", + "repo": "default", + "type": "github" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000..a3b1e57 --- /dev/null +++ b/flake.nix @@ -0,0 +1,76 @@ +{ + inputs = { + flake-utils = { url = "github:numtide/flake-utils"; }; + nixpkgs = { url = "github:NixOS/nixpkgs/nixos-unstable"; }; + rust-overlay = + { + url = "github:oxalica/rust-overlay"; + inputs = { + nixpkgs.follows = "nixpkgs"; + flake-utils.follows = "flake-utils"; + }; + }; + crane = { + url = "github:ipetkov/crane"; + inputs = { + nixpkgs.follows = "nixpkgs"; + }; + }; + }; + outputs = + { self, nixpkgs, flake-utils, rust-overlay, crane }: + flake-utils.lib.eachDefaultSystem (system: + let + pkgs = import nixpkgs { + inherit system; + overlays = [ (import rust-overlay) ]; + }; + rustToolchain = pkgs.pkgsBuildHost.rust-bin.fromRustupToolchainFile ./rust-toolchain.toml; + craneLib = (crane.mkLib pkgs).overrideToolchain rustToolchain; + src = craneLib.cleanCargoSource (craneLib.path ./.); + + buildInputs = with pkgs; [ pkgs.stdenv.cc.cc ]; + nativeBuildInputs = with pkgs; [ + rustToolchain + clang + mold + pkg-config + xz + ] + ++ lib.optionals pkgs.stdenv.isLinux [ autoPatchelfHook ] + ++ lib.optionals pkgs.stdenv.isDarwin + (with pkgs.darwin.apple_sdk.frameworks; [ + CoreFoundation + CoreServices + SystemConfiguration + Security + ]); + commonArgs = { + pname = "fluke"; + version = "latest"; + strictDeps = true; + dontStrip = true; + # workaround for https://github.com/NixOS/nixpkgs/issues/166205 + env = with pkgs; lib.optionalAttrs stdenv.cc.isClang { + NIX_LDFLAGS = "-l${stdenv.cc.libcxx.cxxabi.libName}"; + }; + inherit src buildInputs nativeBuildInputs; + }; + cargoArtifacts = craneLib.buildDepsOnly commonArgs; + bin = craneLib.buildPackage (commonArgs // { + inherit cargoArtifacts; + }); + in + with pkgs; + { + packages = { + inherit bin; + default = bin; + }; + devShells.default = mkShell { + inputsFrom = [ bin ]; + packages = with pkgs; [ just nixpkgs-fmt ]; + }; + } + ); +} From 5fe9d15812ff6e778b1035c3371eac9be8f69a50 Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 26 Jan 2024 18:33:06 +0100 Subject: [PATCH 3/5] Okay forget lzma --- .direnv/flake-profile | 1 + .direnv/flake-profile-1-link | 1 + Cargo.lock | 2 -- crates/rc-zip/Cargo.toml | 4 +++- crates/rc-zip/src/reader/sync/decoder.rs | 12 ++++++------ crates/rc-zip/src/reader/sync/entry_reader.rs | 2 +- crates/rc-zip/testdata/test-zips/found-me.txt | 0 flake.nix | 2 +- rust-toolchain.toml | 3 +++ 9 files changed, 16 insertions(+), 11 deletions(-) create mode 120000 .direnv/flake-profile create mode 120000 .direnv/flake-profile-1-link create mode 100644 crates/rc-zip/testdata/test-zips/found-me.txt create mode 100644 rust-toolchain.toml diff --git a/.direnv/flake-profile b/.direnv/flake-profile new file mode 120000 index 0000000..0c05709 --- /dev/null +++ b/.direnv/flake-profile @@ -0,0 +1 @@ +flake-profile-1-link \ No newline at end of file diff --git a/.direnv/flake-profile-1-link b/.direnv/flake-profile-1-link new file mode 120000 index 0000000..356f6b6 --- /dev/null +++ b/.direnv/flake-profile-1-link @@ -0,0 +1 @@ +/nix/store/bdw4nizg6fg6i13gyi43skk3clwg7fjs-nix-shell-env \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index f64bdac..6f055ea 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -607,8 +607,6 @@ dependencies = [ [[package]] name = "rust-lzma" version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d62915608f6cee1d7f2fc00f28b4f058ff79d6e4ec3c2fe0006b09b52437c84" dependencies = [ "pkg-config", "vcpkg", diff --git a/crates/rc-zip/Cargo.toml b/crates/rc-zip/Cargo.toml index e2ba977..5ec9c42 100644 --- a/crates/rc-zip/Cargo.toml +++ b/crates/rc-zip/Cargo.toml @@ -26,7 +26,9 @@ chardetng = "0.1.17" flate2 = { version = "1.0.28", optional = true } num_enum = "0.7.2" byteorder = "1.5.0" -rust-lzma = { version = "0.6.0", optional = true } +# rust-lzma = { version = "0.6.0", optional = true } +# FIXME: +rust-lzma = { path = "../../../rust-lzma", optional = true } [features] default = ["sync", "file", "deflate"] diff --git a/crates/rc-zip/src/reader/sync/decoder.rs b/crates/rc-zip/src/reader/sync/decoder.rs index 479d129..88a8de3 100644 --- a/crates/rc-zip/src/reader/sync/decoder.rs +++ b/crates/rc-zip/src/reader/sync/decoder.rs @@ -2,7 +2,7 @@ use flate2::read::DeflateDecoder; #[cfg(feature = "lzma")] -use xz2::read::XzDecoder; +use lzma::reader::LzmaReader; use std::{cmp, io}; @@ -24,25 +24,25 @@ where R: io::Read, { fn into_inner(self: Box) -> R { - DeflateDecoder::into_inner(*self) + Self::into_inner(*self) } fn get_mut(&mut self) -> &mut R { - DeflateDecoder::get_mut(self) + Self::get_mut(self) } } #[cfg(feature = "lzma")] -impl Decoder for XzDecoder +impl Decoder for LzmaReader where R: io::Read, { fn into_inner(self: Box) -> R { - XzDecoder::into_inner(*self) + Self::into_inner(*self) } fn get_mut(&mut self) -> &mut R { - XzDecoder::get_mut(self) + Self::get_mut(self) } } diff --git a/crates/rc-zip/src/reader/sync/entry_reader.rs b/crates/rc-zip/src/reader/sync/entry_reader.rs index 0ef3067..b1f2130 100644 --- a/crates/rc-zip/src/reader/sync/entry_reader.rs +++ b/crates/rc-zip/src/reader/sync/entry_reader.rs @@ -102,7 +102,7 @@ where let data = &data[..size as usize]; trace!(%major, %minor, %size, "LZMA properties header, data = {data:02x?}"); - Box::new(xz2::read::XzDecoder::new_stream(limited_reader, xz2::stream::Stream::new_lzma_decoder(128 * 1024 * 1024)?)) + Box::new(lzma::reader::LzmaReader::new_decompressor(limited_reader).map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?) } #[cfg(not(feature = "lzma"))] diff --git a/crates/rc-zip/testdata/test-zips/found-me.txt b/crates/rc-zip/testdata/test-zips/found-me.txt new file mode 100644 index 0000000..e69de29 diff --git a/flake.nix b/flake.nix index a3b1e57..be8ef3e 100644 --- a/flake.nix +++ b/flake.nix @@ -46,7 +46,7 @@ Security ]); commonArgs = { - pname = "fluke"; + pname = "rc_zip"; version = "latest"; strictDeps = true; dontStrip = true; diff --git a/rust-toolchain.toml b/rust-toolchain.toml new file mode 100644 index 0000000..d24f9dc --- /dev/null +++ b/rust-toolchain.toml @@ -0,0 +1,3 @@ +[toolchain] +channel = "stable" +components = ["llvm-tools", "clippy", "rust-src"] From cb3355b041a4fdc9c51a2686b541a7198fd4593a Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 26 Jan 2024 19:22:30 +0100 Subject: [PATCH 4/5] Struggle with lzma support --- Cargo.lock | 37 +- crates/rc-zip/Cargo.toml | 7 +- crates/rc-zip/src/error.rs | 4 + crates/rc-zip/src/reader/sync/decoder.rs | 5 +- crates/rc-zip/src/reader/sync/entry_reader.rs | 156 ++- crates/rc-zip/testdata/lzma-specification.txt | 1176 +++++++++++++++++ crates/rc-zip/testdata/test-zips/.gitignore | 1 + crates/rc-zip/testdata/test-zips/found-me.txt | 0 8 files changed, 1326 insertions(+), 60 deletions(-) create mode 100644 crates/rc-zip/testdata/lzma-specification.txt create mode 100644 crates/rc-zip/testdata/test-zips/.gitignore delete mode 100644 crates/rc-zip/testdata/test-zips/found-me.txt diff --git a/Cargo.lock b/Cargo.lock index 6f055ea..558fa3d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -367,6 +367,17 @@ version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" +[[package]] +name = "lzma-sys" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "memchr" version = "2.7.1" @@ -588,6 +599,7 @@ name = "rc-zip" version = "2.0.1" dependencies = [ "byteorder", + "cfg-if", "chardetng", "chrono", "circular", @@ -599,17 +611,9 @@ dependencies = [ "oem_cp", "positioned-io", "pretty-hex", - "rust-lzma", "thiserror", "tracing", -] - -[[package]] -name = "rust-lzma" -version = "0.6.0" -dependencies = [ - "pkg-config", - "vcpkg", + "xz2", ] [[package]] @@ -815,12 +819,6 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" -[[package]] -name = "vcpkg" -version = "0.2.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" - [[package]] name = "wasm-bindgen" version = "0.2.90" @@ -980,3 +978,12 @@ checksum = "b7cf47b659b318dccbd69cc4797a39ae128f533dce7902a1096044d1967b9c16" dependencies = [ "memchr", ] + +[[package]] +name = "xz2" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" +dependencies = [ + "lzma-sys", +] diff --git a/crates/rc-zip/Cargo.toml b/crates/rc-zip/Cargo.toml index 5ec9c42..c053200 100644 --- a/crates/rc-zip/Cargo.toml +++ b/crates/rc-zip/Cargo.toml @@ -26,13 +26,12 @@ chardetng = "0.1.17" flate2 = { version = "1.0.28", optional = true } num_enum = "0.7.2" byteorder = "1.5.0" -# rust-lzma = { version = "0.6.0", optional = true } -# FIXME: -rust-lzma = { path = "../../../rust-lzma", optional = true } +xz2 = { version = "0.1.7", optional = true } +cfg-if = "1.0.0" [features] default = ["sync", "file", "deflate"] sync = [] file = ["positioned-io"] deflate = ["flate2"] -lzma = ["rust-lzma"] +lzma = ["xz2"] diff --git a/crates/rc-zip/src/error.rs b/crates/rc-zip/src/error.rs index a8894f9..28f2510 100644 --- a/crates/rc-zip/src/error.rs +++ b/crates/rc-zip/src/error.rs @@ -30,6 +30,10 @@ pub enum UnsupportedError { UnsupportedCompressionMethod(crate::format::Method), #[error("compression method supported, but not enabled in this build: {0:?}")] CompressionMethodNotEnabled(crate::format::Method), + #[error("only LZMA2.0 is supported, found LZMA{minor}.{major}")] + LzmaVersionUnsupported { minor: u8, major: u8 }, + #[error("LZMA properties header too short: expected {expected} bytes, got {actual} bytes")] + LzmaPropertiesHeaderTooShort { expected: u16, actual: u16 }, } /// Specific zip format errors, mostly due to invalid zip archives but that could also stem from diff --git a/crates/rc-zip/src/reader/sync/decoder.rs b/crates/rc-zip/src/reader/sync/decoder.rs index 88a8de3..ab81303 100644 --- a/crates/rc-zip/src/reader/sync/decoder.rs +++ b/crates/rc-zip/src/reader/sync/decoder.rs @@ -1,9 +1,6 @@ #[cfg(feature = "deflate")] use flate2::read::DeflateDecoder; -#[cfg(feature = "lzma")] -use lzma::reader::LzmaReader; - use std::{cmp, io}; pub trait Decoder: io::Read @@ -33,7 +30,7 @@ where } #[cfg(feature = "lzma")] -impl Decoder for LzmaReader +impl Decoder for xz2::read::XzDecoder where R: io::Read, { diff --git a/crates/rc-zip/src/reader/sync/entry_reader.rs b/crates/rc-zip/src/reader/sync/entry_reader.rs index b1f2130..b78ce6b 100644 --- a/crates/rc-zip/src/reader/sync/entry_reader.rs +++ b/crates/rc-zip/src/reader/sync/entry_reader.rs @@ -7,12 +7,14 @@ use crate::{ transition, }; -#[cfg(feature = "deflate")] -use flate2::read::DeflateDecoder; +use cfg_if::cfg_if; use nom::Offset; use std::io; use tracing::trace; +#[cfg(feature = "deflate")] +use flate2::read::DeflateDecoder; + struct EntryReadMetrics { uncompressed_size: u64, crc32: u32, @@ -75,41 +77,7 @@ where // allow unnecessary mut for some feature combinations #[allow(unused_mut)] let mut limited_reader = LimitedReader::new(buffer, self.inner.compressed_size); - - let decoder: Box> = match self.method { - Method::Store => Box::new(StoreDecoder::new(limited_reader)), - Method::Deflate => { - #[cfg(feature = "deflate")] - { Box::new(DeflateDecoder::new(limited_reader)) } - - #[cfg(not(feature = "deflate"))] - { return Err(Error::Unsupported(UnsupportedError::CompressionMethodNotEnabled(Method::Deflate)).into()) } - }, - Method::Lzma => { - #[cfg(feature = "lzma")] - { - // TODO: use a parser combinator library for this probably - - // read LZMA properties header first. - use byteorder::{LittleEndian, ReadBytesExt}; - let major: u8 = limited_reader.read_u8()?; - let minor: u8 = limited_reader.read_u8()?; - - let size: u16 = limited_reader.read_u16::()?; - // this is an u16, worse case scenario is 65536 bytes - let mut data = [0u8; 1 << 16]; - limited_reader.read_exact(&mut data[..size as usize])?; - let data = &data[..size as usize]; - trace!(%major, %minor, %size, "LZMA properties header, data = {data:02x?}"); - - Box::new(lzma::reader::LzmaReader::new_decompressor(limited_reader).map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?) - } - - #[cfg(not(feature = "lzma"))] - { return Err(Error::Unsupported(UnsupportedError::CompressionMethodNotEnabled(Method::Lzma)).into()) } - } - method => return Err(Error::Unsupported(UnsupportedError::UnsupportedCompressionMethod(method)).into()), - }; + let decoder: Box> = self.get_decoder(limited_reader)?; S::ReadData { hasher: crc32fast::Hasher::new(), @@ -284,4 +252,118 @@ where inner: entry.inner, } } + + fn get_decoder( + &self, + #[allow(unused_mut)] mut limited_reader: LimitedReader, + ) -> std::io::Result>> { + let decoder: Box> = match self.method { + Method::Store => Box::new(StoreDecoder::new(limited_reader)), + Method::Deflate => { + cfg_if! { + if #[cfg(feature = "deflate")] { + Box::new(DeflateDecoder::new(limited_reader)) + } else { + return Err( + Error::Unsupported(UnsupportedError::CompressionMethodNotEnabled( + Method::Deflate, + )) + .into(), + ); + } + } + } + Method::Lzma => { + cfg_if! { + if #[cfg(feature = "lzma")] { + // TODO: use a parser combinator library for this probably? + + // read LZMA properties header first. + use byteorder::{LittleEndian, ReadBytesExt}; + let major: u8 = limited_reader.read_u8()?; + let minor: u8 = limited_reader.read_u8()?; + if (major, minor) != (2, 0) { + return Err( + Error::Unsupported(UnsupportedError::LzmaVersionUnsupported { + minor, + major, + }) + .into(), + ); + } + + let props_size: u16 = limited_reader.read_u16::()?; + + const LZMA_2_0_PROPS_SIZE: u16 = 5; + if props_size != LZMA_2_0_PROPS_SIZE { + return Err(Error::Unsupported( + UnsupportedError::LzmaPropertiesHeaderTooShort { + expected: 5, + actual: props_size, + }, + ) + .into()); + } + let bits_byte: u8 = limited_reader.read_u8()?; + + #[derive(Debug, Clone, Copy)] + struct LzmaProperties { + literal_context_bits: u8, + literal_pos_state_bits: u8, + pos_state_bits: u8, + } + + // from `lzma-specification.txt` + fn decode_properties(mut d: u8) -> LzmaProperties { + let lc = d % 9; + d /= 9; + let pb = d / 5; + let lp = d % 5; + + LzmaProperties { + literal_context_bits: lc, + literal_pos_state_bits: lp, + pos_state_bits: pb, + } + } + + let props = decode_properties(bits_byte); + const LZMA_DIC_MIN: u32 = 1 << 12; + let dict_size: u32 = + std::cmp::min(LZMA_DIC_MIN, limited_reader.read_u32::()?); + + let mut opts = xz2::stream::LzmaOptions::new_preset(0)?; + opts.dict_size(dict_size); + opts.position_bits(props.pos_state_bits as _); + opts.literal_position_bits(props.literal_pos_state_bits as _); + opts.literal_context_bits(props.literal_context_bits as _); + + let mut filters = xz2::stream::Filters::new(); + filters.lzma2(&opts); + // let stream = xz2::stream::Stream::new_lzma_decoder(&filters)?; + + // let stream = xz2::stream::Stream::new_lzma_encoder(&opts)?; + let stream = xz2::stream::Stream::new_stream_encoder(&filters, xz2::stream::Check::None)?; + + Box::new(xz2::read::XzDecoder::new_stream(limited_reader, stream)) + } else { + return Err( + Error::Unsupported(UnsupportedError::CompressionMethodNotEnabled( + Method::Lzma, + )) + .into(), + ); + } + } + } + method => { + return Err( + Error::Unsupported(UnsupportedError::UnsupportedCompressionMethod(method)) + .into(), + ) + } + }; + + Ok(decoder) + } } diff --git a/crates/rc-zip/testdata/lzma-specification.txt b/crates/rc-zip/testdata/lzma-specification.txt new file mode 100644 index 0000000..ac0cce7 --- /dev/null +++ b/crates/rc-zip/testdata/lzma-specification.txt @@ -0,0 +1,1176 @@ +LZMA specification (DRAFT version) +---------------------------------- + +Author: Igor Pavlov +Date: 2015-06-14 + +This specification defines the format of LZMA compressed data and lzma file format. + +Notation +-------- + +We use the syntax of C++ programming language. +We use the following types in C++ code: + unsigned - unsigned integer, at least 16 bits in size + int - signed integer, at least 16 bits in size + UInt64 - 64-bit unsigned integer + UInt32 - 32-bit unsigned integer + UInt16 - 16-bit unsigned integer + Byte - 8-bit unsigned integer + bool - boolean type with two possible values: false, true + + +lzma file format +================ + +The lzma file contains the raw LZMA stream and the header with related properties. + +The files in that format use ".lzma" extension. + +The lzma file format layout: + +Offset Size Description + + 0 1 LZMA model properties (lc, lp, pb) in encoded form + 1 4 Dictionary size (32-bit unsigned integer, little-endian) + 5 8 Uncompressed size (64-bit unsigned integer, little-endian) + 13 Compressed data (LZMA stream) + +LZMA properties: + + name Range Description + + lc [0, 8] the number of "literal context" bits + lp [0, 4] the number of "literal pos" bits + pb [0, 4] the number of "pos" bits +dictSize [0, 2^32 - 1] the dictionary size + +The following code encodes LZMA properties: + +void EncodeProperties(Byte *properties) +{ + properties[0] = (Byte)((pb * 5 + lp) * 9 + lc); + Set_UInt32_LittleEndian(properties + 1, dictSize); +} + +If the value of dictionary size in properties is smaller than (1 << 12), +the LZMA decoder must set the dictionary size variable to (1 << 12). + +#define LZMA_DIC_MIN (1 << 12) + + unsigned lc, pb, lp; + UInt32 dictSize; + UInt32 dictSizeInProperties; + + void DecodeProperties(const Byte *properties) + { + unsigned d = properties[0]; + if (d >= (9 * 5 * 5)) + throw "Incorrect LZMA properties"; + lc = d % 9; + d /= 9; + pb = d / 5; + lp = d % 5; + dictSizeInProperties = 0; + for (int i = 0; i < 4; i++) + dictSizeInProperties |= (UInt32)properties[i + 1] << (8 * i); + dictSize = dictSizeInProperties; + if (dictSize < LZMA_DIC_MIN) + dictSize = LZMA_DIC_MIN; + } + +If "Uncompressed size" field contains ones in all 64 bits, it means that +uncompressed size is unknown and there is the "end marker" in stream, +that indicates the end of decoding point. +In opposite case, if the value from "Uncompressed size" field is not +equal to ((2^64) - 1), the LZMA stream decoding must be finished after +specified number of bytes (Uncompressed size) is decoded. And if there +is the "end marker", the LZMA decoder must read that marker also. + + +The new scheme to encode LZMA properties +---------------------------------------- + +If LZMA compression is used for some another format, it's recommended to +use a new improved scheme to encode LZMA properties. That new scheme was +used in xz format that uses the LZMA2 compression algorithm. +The LZMA2 is a new compression algorithm that is based on the LZMA algorithm. + +The dictionary size in LZMA2 is encoded with just one byte and LZMA2 supports +only reduced set of dictionary sizes: + (2 << 11), (3 << 11), + (2 << 12), (3 << 12), + ... + (2 << 30), (3 << 30), + (2 << 31) - 1 + +The dictionary size can be extracted from encoded value with the following code: + + dictSize = (p == 40) ? 0xFFFFFFFF : (((UInt32)2 | ((p) & 1)) << ((p) / 2 + 11)); + +Also there is additional limitation (lc + lp <= 4) in LZMA2 for values of +"lc" and "lp" properties: + + if (lc + lp > 4) + throw "Unsupported properties: (lc + lp) > 4"; + +There are some advantages for LZMA decoder with such (lc + lp) value +limitation. It reduces the maximum size of tables allocated by decoder. +And it reduces the complexity of initialization procedure, that can be +important to keep high speed of decoding of big number of small LZMA streams. + +It's recommended to use that limitation (lc + lp <= 4) for any new format +that uses LZMA compression. Note that the combinations of "lc" and "lp" +parameters, where (lc + lp > 4), can provide significant improvement in +compression ratio only in some rare cases. + +The LZMA properties can be encoded into two bytes in new scheme: + +Offset Size Description + + 0 1 The dictionary size encoded with LZMA2 scheme + 1 1 LZMA model properties (lc, lp, pb) in encoded form + + +The RAM usage +============= + +The RAM usage for LZMA decoder is determined by the following parts: + +1) The Sliding Window (from 4 KiB to 4 GiB). +2) The probability model counter arrays (arrays of 16-bit variables). +3) Some additional state variables (about 10 variables of 32-bit integers). + + +The RAM usage for Sliding Window +-------------------------------- + +There are two main scenarios of decoding: + +1) The decoding of full stream to one RAM buffer. + + If we decode full LZMA stream to one output buffer in RAM, the decoder + can use that output buffer as sliding window. So the decoder doesn't + need additional buffer allocated for sliding window. + +2) The decoding to some external storage. + + If we decode LZMA stream to external storage, the decoder must allocate + the buffer for sliding window. The size of that buffer must be equal + or larger than the value of dictionary size from properties of LZMA stream. + +In this specification we describe the code for decoding to some external +storage. The optimized version of code for decoding of full stream to one +output RAM buffer can require some minor changes in code. + + +The RAM usage for the probability model counters +------------------------------------------------ + +The size of the probability model counter arrays is calculated with the +following formula: + +size_of_prob_arrays = 1846 + 768 * (1 << (lp + lc)) + +Each probability model counter is 11-bit unsigned integer. +If we use 16-bit integer variables (2-byte integers) for these probability +model counters, the RAM usage required by probability model counter arrays +can be estimated with the following formula: + + RAM = 4 KiB + 1.5 KiB * (1 << (lp + lc)) + +For example, for default LZMA parameters (lp = 0 and lc = 3), the RAM usage is + + RAM_lc3_lp0 = 4 KiB + 1.5 KiB * 8 = 16 KiB + +The maximum RAM state usage is required for decoding the stream with lp = 4 +and lc = 8: + + RAM_lc8_lp4 = 4 KiB + 1.5 KiB * 4096 = 6148 KiB + +If the decoder uses LZMA2's limited property condition +(lc + lp <= 4), the RAM usage will be not larger than + + RAM_lc_lp_4 = 4 KiB + 1.5 KiB * 16 = 28 KiB + + +The RAM usage for encoder +------------------------- + +There are many variants for LZMA encoding code. +These variants have different values for memory consumption. +Note that memory consumption for LZMA Encoder can not be +smaller than memory consumption of LZMA Decoder for same stream. + +The RAM usage required by modern effective implementation of +LZMA Encoder can be estimated with the following formula: + + Encoder_RAM_Usage = 4 MiB + 11 * dictionarySize. + +But there are some modes of the encoder that require less memory. + + +LZMA Decoding +============= + +The LZMA compression algorithm uses LZ-based compression with Sliding Window +and Range Encoding as entropy coding method. + + +Sliding Window +-------------- + +LZMA uses Sliding Window compression similar to LZ77 algorithm. + +LZMA stream must be decoded to the sequence that consists +of MATCHES and LITERALS: + + - a LITERAL is a 8-bit character (one byte). + The decoder just puts that LITERAL to the uncompressed stream. + + - a MATCH is a pair of two numbers (DISTANCE-LENGTH pair). + The decoder takes one byte exactly "DISTANCE" characters behind + current position in the uncompressed stream and puts it to + uncompressed stream. The decoder must repeat it "LENGTH" times. + +The "DISTANCE" can not be larger than dictionary size. +And the "DISTANCE" can not be larger than the number of bytes in +the uncompressed stream that were decoded before that match. + +In this specification we use cyclic buffer to implement Sliding Window +for LZMA decoder: + +class COutWindow +{ + Byte *Buf; + UInt32 Pos; + UInt32 Size; + bool IsFull; + +public: + unsigned TotalPos; + COutStream OutStream; + + COutWindow(): Buf(NULL) {} + ~COutWindow() { delete []Buf; } + + void Create(UInt32 dictSize) + { + Buf = new Byte[dictSize]; + Pos = 0; + Size = dictSize; + IsFull = false; + TotalPos = 0; + } + + void PutByte(Byte b) + { + TotalPos++; + Buf[Pos++] = b; + if (Pos == Size) + { + Pos = 0; + IsFull = true; + } + OutStream.WriteByte(b); + } + + Byte GetByte(UInt32 dist) const + { + return Buf[dist <= Pos ? Pos - dist : Size - dist + Pos]; + } + + void CopyMatch(UInt32 dist, unsigned len) + { + for (; len > 0; len--) + PutByte(GetByte(dist)); + } + + bool CheckDistance(UInt32 dist) const + { + return dist <= Pos || IsFull; + } + + bool IsEmpty() const + { + return Pos == 0 && !IsFull; + } +}; + + +In another implementation it's possible to use one buffer that contains +Sliding Window and the whole data stream after uncompressing. + + +Range Decoder +------------- + +LZMA algorithm uses Range Encoding (1) as entropy coding method. + +LZMA stream contains just one very big number in big-endian encoding. +LZMA decoder uses the Range Decoder to extract a sequence of binary +symbols from that big number. + +The state of the Range Decoder: + +struct CRangeDecoder +{ + UInt32 Range; + UInt32 Code; + InputStream *InStream; + + bool Corrupted; +} + +The notes about UInt32 type for the "Range" and "Code" variables: + + It's possible to use 64-bit (unsigned or signed) integer type + for the "Range" and the "Code" variables instead of 32-bit unsigned, + but some additional code must be used to truncate the values to + low 32-bits after some operations. + + If the programming language does not support 32-bit unsigned integer type + (like in case of JAVA language), it's possible to use 32-bit signed integer, + but some code must be changed. For example, it's required to change the code + that uses comparison operations for UInt32 variables in this specification. + +The Range Decoder can be in some states that can be treated as +"Corruption" in LZMA stream. The Range Decoder uses the variable "Corrupted": + + (Corrupted == false), if the Range Decoder has not detected any corruption. + (Corrupted == true), if the Range Decoder has detected some corruption. + +The reference LZMA Decoder ignores the value of the "Corrupted" variable. +So it continues to decode the stream, even if the corruption can be detected +in the Range Decoder. To provide the full compatibility with output of the +reference LZMA Decoder, another LZMA Decoder implementations must also +ignore the value of the "Corrupted" variable. + +The LZMA Encoder is required to create only such LZMA streams, that will not +lead the Range Decoder to states, where the "Corrupted" variable is set to true. + +The Range Decoder reads first 5 bytes from input stream to initialize +the state: + +bool CRangeDecoder::Init() +{ + Corrupted = false; + Range = 0xFFFFFFFF; + Code = 0; + + Byte b = InStream->ReadByte(); + + for (int i = 0; i < 4; i++) + Code = (Code << 8) | InStream->ReadByte(); + + if (b != 0 || Code == Range) + Corrupted = true; + return b == 0; +} + +The LZMA Encoder always writes ZERO in initial byte of compressed stream. +That scheme allows to simplify the code of the Range Encoder in the +LZMA Encoder. If initial byte is not equal to ZERO, the LZMA Decoder must +stop decoding and report error. + +After the last bit of data was decoded by Range Decoder, the value of the +"Code" variable must be equal to 0. The LZMA Decoder must check it by +calling the IsFinishedOK() function: + + bool IsFinishedOK() const { return Code == 0; } + +If there is corruption in data stream, there is big probability that +the "Code" value will be not equal to 0 in the Finish() function. So that +check in the IsFinishedOK() function provides very good feature for +corruption detection. + +The value of the "Range" variable before each bit decoding can not be smaller +than ((UInt32)1 << 24). The Normalize() function keeps the "Range" value in +described range. + +#define kTopValue ((UInt32)1 << 24) + +void CRangeDecoder::Normalize() +{ + if (Range < kTopValue) + { + Range <<= 8; + Code = (Code << 8) | InStream->ReadByte(); + } +} + +Notes: if the size of the "Code" variable is larger than 32 bits, it's +required to keep only low 32 bits of the "Code" variable after the change +in Normalize() function. + +If the LZMA Stream is not corrupted, the value of the "Code" variable is +always smaller than value of the "Range" variable. +But the Range Decoder ignores some types of corruptions, so the value of +the "Code" variable can be equal or larger than value of the "Range" variable +for some "Corrupted" archives. + + +LZMA uses Range Encoding only with binary symbols of two types: + 1) binary symbols with fixed and equal probabilities (direct bits) + 2) binary symbols with predicted probabilities + +The DecodeDirectBits() function decodes the sequence of direct bits: + +UInt32 CRangeDecoder::DecodeDirectBits(unsigned numBits) +{ + UInt32 res = 0; + do + { + Range >>= 1; + Code -= Range; + UInt32 t = 0 - ((UInt32)Code >> 31); + Code += Range & t; + + if (Code == Range) + Corrupted = true; + + Normalize(); + res <<= 1; + res += t + 1; + } + while (--numBits); + return res; +} + + +The Bit Decoding with Probability Model +--------------------------------------- + +The task of Bit Probability Model is to estimate probabilities of binary +symbols. And then it provides the Range Decoder with that information. +The better prediction provides better compression ratio. +The Bit Probability Model uses statistical data of previous decoded +symbols. + +That estimated probability is presented as 11-bit unsigned integer value +that represents the probability of symbol "0". + +#define kNumBitModelTotalBits 11 + +Mathematical probabilities can be presented with the following formulas: + probability(symbol_0) = prob / 2048. + probability(symbol_1) = 1 - Probability(symbol_0) = + = 1 - prob / 2048 = + = (2048 - prob) / 2048 +where the "prob" variable contains 11-bit integer probability counter. + +It's recommended to use 16-bit unsigned integer type, to store these 11-bit +probability values: + +typedef UInt16 CProb; + +Each probability value must be initialized with value ((1 << 11) / 2), +that represents the state, where probabilities of symbols 0 and 1 +are equal to 0.5: + +#define PROB_INIT_VAL ((1 << kNumBitModelTotalBits) / 2) + +The INIT_PROBS macro is used to initialize the array of CProb variables: + +#define INIT_PROBS(p) \ + { for (unsigned i = 0; i < sizeof(p) / sizeof(p[0]); i++) p[i] = PROB_INIT_VAL; } + + +The DecodeBit() function decodes one bit. +The LZMA decoder provides the pointer to CProb variable that contains +information about estimated probability for symbol 0 and the Range Decoder +updates that CProb variable after decoding. The Range Decoder increases +estimated probability of the symbol that was decoded: + +#define kNumMoveBits 5 + +unsigned CRangeDecoder::DecodeBit(CProb *prob) +{ + unsigned v = *prob; + UInt32 bound = (Range >> kNumBitModelTotalBits) * v; + unsigned symbol; + if (Code < bound) + { + v += ((1 << kNumBitModelTotalBits) - v) >> kNumMoveBits; + Range = bound; + symbol = 0; + } + else + { + v -= v >> kNumMoveBits; + Code -= bound; + Range -= bound; + symbol = 1; + } + *prob = (CProb)v; + Normalize(); + return symbol; +} + + +The Binary Tree of bit model counters +------------------------------------- + +LZMA uses a tree of Bit model variables to decode symbol that needs +several bits for storing. There are two versions of such trees in LZMA: + 1) the tree that decodes bits from high bit to low bit (the normal scheme). + 2) the tree that decodes bits from low bit to high bit (the reverse scheme). + +Each binary tree structure supports different size of decoded symbol +(the size of binary sequence that contains value of symbol). +If that size of decoded symbol is "NumBits" bits, the tree structure +uses the array of (2 << NumBits) counters of CProb type. +But only ((2 << NumBits) - 1) items are used by encoder and decoder. +The first item (the item with index equal to 0) in array is unused. +That scheme with unused array's item allows to simplify the code. + +unsigned BitTreeReverseDecode(CProb *probs, unsigned numBits, CRangeDecoder *rc) +{ + unsigned m = 1; + unsigned symbol = 0; + for (unsigned i = 0; i < numBits; i++) + { + unsigned bit = rc->DecodeBit(&probs[m]); + m <<= 1; + m += bit; + symbol |= (bit << i); + } + return symbol; +} + +template +class CBitTreeDecoder +{ + CProb Probs[(unsigned)1 << NumBits]; + +public: + + void Init() + { + INIT_PROBS(Probs); + } + + unsigned Decode(CRangeDecoder *rc) + { + unsigned m = 1; + for (unsigned i = 0; i < NumBits; i++) + m = (m << 1) + rc->DecodeBit(&Probs[m]); + return m - ((unsigned)1 << NumBits); + } + + unsigned ReverseDecode(CRangeDecoder *rc) + { + return BitTreeReverseDecode(Probs, NumBits, rc); + } +}; + + +LZ part of LZMA +--------------- + +LZ part of LZMA describes details about the decoding of MATCHES and LITERALS. + + +The Literal Decoding +-------------------- + +The LZMA Decoder uses (1 << (lc + lp)) tables with CProb values, where +each table contains 0x300 CProb values: + + CProb *LitProbs; + + void CreateLiterals() + { + LitProbs = new CProb[(UInt32)0x300 << (lc + lp)]; + } + + void InitLiterals() + { + UInt32 num = (UInt32)0x300 << (lc + lp); + for (UInt32 i = 0; i < num; i++) + LitProbs[i] = PROB_INIT_VAL; + } + +To select the table for decoding it uses the context that consists of +(lc) high bits from previous literal and (lp) low bits from value that +represents current position in outputStream. + +If (State > 7), the Literal Decoder also uses "matchByte" that represents +the byte in OutputStream at position the is the DISTANCE bytes before +current position, where the DISTANCE is the distance in DISTANCE-LENGTH pair +of latest decoded match. + +The following code decodes one literal and puts it to Sliding Window buffer: + + void DecodeLiteral(unsigned state, UInt32 rep0) + { + unsigned prevByte = 0; + if (!OutWindow.IsEmpty()) + prevByte = OutWindow.GetByte(1); + + unsigned symbol = 1; + unsigned litState = ((OutWindow.TotalPos & ((1 << lp) - 1)) << lc) + (prevByte >> (8 - lc)); + CProb *probs = &LitProbs[(UInt32)0x300 * litState]; + + if (state >= 7) + { + unsigned matchByte = OutWindow.GetByte(rep0 + 1); + do + { + unsigned matchBit = (matchByte >> 7) & 1; + matchByte <<= 1; + unsigned bit = RangeDec.DecodeBit(&probs[((1 + matchBit) << 8) + symbol]); + symbol = (symbol << 1) | bit; + if (matchBit != bit) + break; + } + while (symbol < 0x100); + } + while (symbol < 0x100) + symbol = (symbol << 1) | RangeDec.DecodeBit(&probs[symbol]); + OutWindow.PutByte((Byte)(symbol - 0x100)); + } + + +The match length decoding +------------------------- + +The match length decoder returns normalized (zero-based value) +length of match. That value can be converted to real length of the match +with the following code: + +#define kMatchMinLen 2 + + matchLen = len + kMatchMinLen; + +The match length decoder can return the values from 0 to 271. +And the corresponded real match length values can be in the range +from 2 to 273. + +The following scheme is used for the match length encoding: + + Binary encoding Binary Tree structure Zero-based match length + sequence (binary + decimal): + + 0 xxx LowCoder[posState] xxx + 1 0 yyy MidCoder[posState] yyy + 8 + 1 1 zzzzzzzz HighCoder zzzzzzzz + 16 + +LZMA uses bit model variable "Choice" to decode the first selection bit. + +If the first selection bit is equal to 0, the decoder uses binary tree + LowCoder[posState] to decode 3-bit zero-based match length (xxx). + +If the first selection bit is equal to 1, the decoder uses bit model + variable "Choice2" to decode the second selection bit. + + If the second selection bit is equal to 0, the decoder uses binary tree + MidCoder[posState] to decode 3-bit "yyy" value, and zero-based match + length is equal to (yyy + 8). + + If the second selection bit is equal to 1, the decoder uses binary tree + HighCoder to decode 8-bit "zzzzzzzz" value, and zero-based + match length is equal to (zzzzzzzz + 16). + +LZMA uses "posState" value as context to select the binary tree +from LowCoder and MidCoder binary tree arrays: + + unsigned posState = OutWindow.TotalPos & ((1 << pb) - 1); + +The full code of the length decoder: + +class CLenDecoder +{ + CProb Choice; + CProb Choice2; + CBitTreeDecoder<3> LowCoder[1 << kNumPosBitsMax]; + CBitTreeDecoder<3> MidCoder[1 << kNumPosBitsMax]; + CBitTreeDecoder<8> HighCoder; + +public: + + void Init() + { + Choice = PROB_INIT_VAL; + Choice2 = PROB_INIT_VAL; + HighCoder.Init(); + for (unsigned i = 0; i < (1 << kNumPosBitsMax); i++) + { + LowCoder[i].Init(); + MidCoder[i].Init(); + } + } + + unsigned Decode(CRangeDecoder *rc, unsigned posState) + { + if (rc->DecodeBit(&Choice) == 0) + return LowCoder[posState].Decode(rc); + if (rc->DecodeBit(&Choice2) == 0) + return 8 + MidCoder[posState].Decode(rc); + return 16 + HighCoder.Decode(rc); + } +}; + +The LZMA decoder uses two instances of CLenDecoder class. +The first instance is for the matches of "Simple Match" type, +and the second instance is for the matches of "Rep Match" type: + + CLenDecoder LenDecoder; + CLenDecoder RepLenDecoder; + + +The match distance decoding +--------------------------- + +LZMA supports dictionary sizes up to 4 GiB minus 1. +The value of match distance (decoded by distance decoder) can be +from 1 to 2^32. But the distance value that is equal to 2^32 is used to +indicate the "End of stream" marker. So real largest match distance +that is used for LZ-window match is (2^32 - 1). + +LZMA uses normalized match length (zero-based length) +to calculate the context state "lenState" do decode the distance value: + +#define kNumLenToPosStates 4 + + unsigned lenState = len; + if (lenState > kNumLenToPosStates - 1) + lenState = kNumLenToPosStates - 1; + +The distance decoder returns the "dist" value that is zero-based value +of match distance. The real match distance can be calculated with the +following code: + + matchDistance = dist + 1; + +The state of the distance decoder and the initialization code: + + #define kEndPosModelIndex 14 + #define kNumFullDistances (1 << (kEndPosModelIndex >> 1)) + #define kNumAlignBits 4 + + CBitTreeDecoder<6> PosSlotDecoder[kNumLenToPosStates]; + CProb PosDecoders[1 + kNumFullDistances - kEndPosModelIndex]; + CBitTreeDecoder AlignDecoder; + + void InitDist() + { + for (unsigned i = 0; i < kNumLenToPosStates; i++) + PosSlotDecoder[i].Init(); + AlignDecoder.Init(); + INIT_PROBS(PosDecoders); + } + +At first stage the distance decoder decodes 6-bit "posSlot" value with bit +tree decoder from PosSlotDecoder array. It's possible to get 2^6=64 different +"posSlot" values. + + unsigned posSlot = PosSlotDecoder[lenState].Decode(&RangeDec); + +The encoding scheme for distance value is shown in the following table: + +posSlot (decimal) / + zero-based distance (binary) + 0 0 + 1 1 + 2 10 + 3 11 + + 4 10 x + 5 11 x + 6 10 xx + 7 11 xx + 8 10 xxx + 9 11 xxx +10 10 xxxx +11 11 xxxx +12 10 xxxxx +13 11 xxxxx + +14 10 yy zzzz +15 11 yy zzzz +16 10 yyy zzzz +17 11 yyy zzzz +... +62 10 yyyyyyyyyyyyyyyyyyyyyyyyyy zzzz +63 11 yyyyyyyyyyyyyyyyyyyyyyyyyy zzzz + +where + "x ... x" means the sequence of binary symbols encoded with binary tree and + "Reverse" scheme. It uses separated binary tree for each posSlot from 4 to 13. + "y" means direct bit encoded with range coder. + "zzzz" means the sequence of four binary symbols encoded with binary + tree with "Reverse" scheme, where one common binary tree "AlignDecoder" + is used for all posSlot values. + +If (posSlot < 4), the "dist" value is equal to posSlot value. + +If (posSlot >= 4), the decoder uses "posSlot" value to calculate the value of + the high bits of "dist" value and the number of the low bits. + + If (4 <= posSlot < kEndPosModelIndex), the decoder uses bit tree decoders. + (one separated bit tree decoder per one posSlot value) and "Reverse" scheme. + In this implementation we use one CProb array "PosDecoders" that contains + all CProb variables for all these bit decoders. + + if (posSlot >= kEndPosModelIndex), the middle bits are decoded as direct + bits from RangeDecoder and the low 4 bits are decoded with a bit tree + decoder "AlignDecoder" with "Reverse" scheme. + +The code to decode zero-based match distance: + + unsigned DecodeDistance(unsigned len) + { + unsigned lenState = len; + if (lenState > kNumLenToPosStates - 1) + lenState = kNumLenToPosStates - 1; + + unsigned posSlot = PosSlotDecoder[lenState].Decode(&RangeDec); + if (posSlot < 4) + return posSlot; + + unsigned numDirectBits = (unsigned)((posSlot >> 1) - 1); + UInt32 dist = ((2 | (posSlot & 1)) << numDirectBits); + if (posSlot < kEndPosModelIndex) + dist += BitTreeReverseDecode(PosDecoders + dist - posSlot, numDirectBits, &RangeDec); + else + { + dist += RangeDec.DecodeDirectBits(numDirectBits - kNumAlignBits) << kNumAlignBits; + dist += AlignDecoder.ReverseDecode(&RangeDec); + } + return dist; + } + + + +LZMA Decoding modes +------------------- + +There are 2 types of LZMA streams: + +1) The stream with "End of stream" marker. +2) The stream without "End of stream" marker. + +And the LZMA Decoder supports 3 modes of decoding: + +1) The unpack size is undefined. The LZMA decoder stops decoding after + getting "End of stream" marker. + The input variables for that case: + + markerIsMandatory = true + unpackSizeDefined = false + unpackSize contains any value + +2) The unpack size is defined and LZMA decoder supports both variants, + where the stream can contain "End of stream" marker or the stream is + finished without "End of stream" marker. The LZMA decoder must detect + any of these situations. + The input variables for that case: + + markerIsMandatory = false + unpackSizeDefined = true + unpackSize contains unpack size + +3) The unpack size is defined and the LZMA stream must contain + "End of stream" marker + The input variables for that case: + + markerIsMandatory = true + unpackSizeDefined = true + unpackSize contains unpack size + + +The main loop of decoder +------------------------ + +The main loop of LZMA decoder: + +Initialize the LZMA state. +loop +{ + // begin of loop + Check "end of stream" conditions. + Decode Type of MATCH / LITERAL. + If it's LITERAL, decode LITERAL value and put the LITERAL to Window. + If it's MATCH, decode the length of match and the match distance. + Check error conditions, check end of stream conditions and copy + the sequence of match bytes from sliding window to current position + in window. + Go to begin of loop +} + +The reference implementation of LZMA decoder uses "unpackSize" variable +to keep the number of remaining bytes in output stream. So it reduces +"unpackSize" value after each decoded LITERAL or MATCH. + +The following code contains the "end of stream" condition check at the start +of the loop: + + if (unpackSizeDefined && unpackSize == 0 && !markerIsMandatory) + if (RangeDec.IsFinishedOK()) + return LZMA_RES_FINISHED_WITHOUT_MARKER; + +LZMA uses three types of matches: + +1) "Simple Match" - the match with distance value encoded with bit models. + +2) "Rep Match" - the match that uses the distance from distance + history table. + +3) "Short Rep Match" - the match of single byte length, that uses the latest + distance from distance history table. + +The LZMA decoder keeps the history of latest 4 match distances that were used +by decoder. That set of 4 variables contains zero-based match distances and +these variables are initialized with zero values: + + UInt32 rep0 = 0, rep1 = 0, rep2 = 0, rep3 = 0; + +The LZMA decoder uses binary model variables to select type of MATCH or LITERAL: + +#define kNumStates 12 +#define kNumPosBitsMax 4 + + CProb IsMatch[kNumStates << kNumPosBitsMax]; + CProb IsRep[kNumStates]; + CProb IsRepG0[kNumStates]; + CProb IsRepG1[kNumStates]; + CProb IsRepG2[kNumStates]; + CProb IsRep0Long[kNumStates << kNumPosBitsMax]; + +The decoder uses "state" variable value to select exact variable +from "IsRep", "IsRepG0", "IsRepG1" and "IsRepG2" arrays. +The "state" variable can get the value from 0 to 11. +Initial value for "state" variable is zero: + + unsigned state = 0; + +The "state" variable is updated after each LITERAL or MATCH with one of the +following functions: + +unsigned UpdateState_Literal(unsigned state) +{ + if (state < 4) return 0; + else if (state < 10) return state - 3; + else return state - 6; +} +unsigned UpdateState_Match (unsigned state) { return state < 7 ? 7 : 10; } +unsigned UpdateState_Rep (unsigned state) { return state < 7 ? 8 : 11; } +unsigned UpdateState_ShortRep(unsigned state) { return state < 7 ? 9 : 11; } + +The decoder calculates "state2" variable value to select exact variable from +"IsMatch" and "IsRep0Long" arrays: + +unsigned posState = OutWindow.TotalPos & ((1 << pb) - 1); +unsigned state2 = (state << kNumPosBitsMax) + posState; + +The decoder uses the following code flow scheme to select exact +type of LITERAL or MATCH: + +IsMatch[state2] decode + 0 - the Literal + 1 - the Match + IsRep[state] decode + 0 - Simple Match + 1 - Rep Match + IsRepG0[state] decode + 0 - the distance is rep0 + IsRep0Long[state2] decode + 0 - Short Rep Match + 1 - Rep Match 0 + 1 - + IsRepG1[state] decode + 0 - Rep Match 1 + 1 - + IsRepG2[state] decode + 0 - Rep Match 2 + 1 - Rep Match 3 + + +LITERAL symbol +-------------- +If the value "0" was decoded with IsMatch[state2] decoding, we have "LITERAL" type. + +At first the LZMA decoder must check that it doesn't exceed +specified uncompressed size: + + if (unpackSizeDefined && unpackSize == 0) + return LZMA_RES_ERROR; + +Then it decodes literal value and puts it to sliding window: + + DecodeLiteral(state, rep0); + +Then the decoder must update the "state" value and "unpackSize" value; + + state = UpdateState_Literal(state); + unpackSize--; + +Then the decoder must go to the begin of main loop to decode next Match or Literal. + + +Simple Match +------------ + +If the value "1" was decoded with IsMatch[state2] decoding, +we have the "Simple Match" type. + +The distance history table is updated with the following scheme: + + rep3 = rep2; + rep2 = rep1; + rep1 = rep0; + +The zero-based length is decoded with "LenDecoder": + + len = LenDecoder.Decode(&RangeDec, posState); + +The state is update with UpdateState_Match function: + + state = UpdateState_Match(state); + +and the new "rep0" value is decoded with DecodeDistance: + + rep0 = DecodeDistance(len); + +That "rep0" will be used as zero-based distance for current match. + +If the value of "rep0" is equal to 0xFFFFFFFF, it means that we have +"End of stream" marker, so we can stop decoding and check finishing +condition in Range Decoder: + + if (rep0 == 0xFFFFFFFF) + return RangeDec.IsFinishedOK() ? + LZMA_RES_FINISHED_WITH_MARKER : + LZMA_RES_ERROR; + +If uncompressed size is defined, LZMA decoder must check that it doesn't +exceed that specified uncompressed size: + + if (unpackSizeDefined && unpackSize == 0) + return LZMA_RES_ERROR; + +Also the decoder must check that "rep0" value is not larger than dictionary size +and is not larger than the number of already decoded bytes: + + if (rep0 >= dictSize || !OutWindow.CheckDistance(rep0)) + return LZMA_RES_ERROR; + +Then the decoder must copy match bytes as described in +"The match symbols copying" section. + + +Rep Match +--------- + +If the LZMA decoder has decoded the value "1" with IsRep[state] variable, +we have "Rep Match" type. + +At first the LZMA decoder must check that it doesn't exceed +specified uncompressed size: + + if (unpackSizeDefined && unpackSize == 0) + return LZMA_RES_ERROR; + +Also the decoder must return error, if the LZ window is empty: + + if (OutWindow.IsEmpty()) + return LZMA_RES_ERROR; + +If the match type is "Rep Match", the decoder uses one of the 4 variables of +distance history table to get the value of distance for current match. +And there are 4 corresponding ways of decoding flow. + +The decoder updates the distance history with the following scheme +depending from type of match: + +- "Rep Match 0" or "Short Rep Match": + ; LZMA doesn't update the distance history + +- "Rep Match 1": + UInt32 dist = rep1; + rep1 = rep0; + rep0 = dist; + +- "Rep Match 2": + UInt32 dist = rep2; + rep2 = rep1; + rep1 = rep0; + rep0 = dist; + +- "Rep Match 3": + UInt32 dist = rep3; + rep3 = rep2; + rep2 = rep1; + rep1 = rep0; + rep0 = dist; + +Then the decoder decodes exact subtype of "Rep Match" using "IsRepG0", "IsRep0Long", +"IsRepG1", "IsRepG2". + +If the subtype is "Short Rep Match", the decoder updates the state, puts +the one byte from window to current position in window and goes to next +MATCH/LITERAL symbol (the begin of main loop): + + state = UpdateState_ShortRep(state); + OutWindow.PutByte(OutWindow.GetByte(rep0 + 1)); + unpackSize--; + continue; + +In other cases (Rep Match 0/1/2/3), it decodes the zero-based +length of match with "RepLenDecoder" decoder: + + len = RepLenDecoder.Decode(&RangeDec, posState); + +Then it updates the state: + + state = UpdateState_Rep(state); + +Then the decoder must copy match bytes as described in +"The Match symbols copying" section. + + +The match symbols copying +------------------------- + +If we have the match (Simple Match or Rep Match 0/1/2/3), the decoder must +copy the sequence of bytes with calculated match distance and match length. +If uncompressed size is defined, LZMA decoder must check that it doesn't +exceed that specified uncompressed size: + + len += kMatchMinLen; + bool isError = false; + if (unpackSizeDefined && unpackSize < len) + { + len = (unsigned)unpackSize; + isError = true; + } + OutWindow.CopyMatch(rep0 + 1, len); + unpackSize -= len; + if (isError) + return LZMA_RES_ERROR; + +Then the decoder must go to the begin of main loop to decode next MATCH or LITERAL. + + + +NOTES +----- + +This specification doesn't describe the variant of decoder implementation +that supports partial decoding. Such partial decoding case can require some +changes in "end of stream" condition checks code. Also such code +can use additional status codes, returned by decoder. + +This specification uses C++ code with templates to simplify describing. +The optimized version of LZMA decoder doesn't need templates. +Such optimized version can use just two arrays of CProb variables: + 1) The dynamic array of CProb variables allocated for the Literal Decoder. + 2) The one common array that contains all other CProb variables. + + +References: + +1. G. N. N. Martin, Range encoding: an algorithm for removing redundancy + from a digitized message, Video & Data Recording Conference, + Southampton, UK, July 24-27, 1979. diff --git a/crates/rc-zip/testdata/test-zips/.gitignore b/crates/rc-zip/testdata/test-zips/.gitignore new file mode 100644 index 0000000..106111b --- /dev/null +++ b/crates/rc-zip/testdata/test-zips/.gitignore @@ -0,0 +1 @@ +found-me.txt \ No newline at end of file diff --git a/crates/rc-zip/testdata/test-zips/found-me.txt b/crates/rc-zip/testdata/test-zips/found-me.txt deleted file mode 100644 index e69de29..0000000 From 414e91a6276e4156e61433270bb5ff39364a9306 Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 26 Jan 2024 19:30:56 +0100 Subject: [PATCH 5/5] I have idea. --- crates/rc-zip/src/error.rs | 3 ++ crates/rc-zip/src/reader/sync/entry_reader.rs | 39 ++++++++++++------- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/crates/rc-zip/src/error.rs b/crates/rc-zip/src/error.rs index 28f2510..78f7f3e 100644 --- a/crates/rc-zip/src/error.rs +++ b/crates/rc-zip/src/error.rs @@ -97,6 +97,9 @@ pub enum FormatError { /// The CRC-32 checksum didn't match. #[error("checksum didn't match: expected {expected:x?}, got {actual:x?}")] WrongChecksum { expected: u32, actual: u32 }, + + #[error("lzma properties larger than max")] + LzmaPropertiesLargerThanMax, } impl From for std::io::Error { diff --git a/crates/rc-zip/src/reader/sync/entry_reader.rs b/crates/rc-zip/src/reader/sync/entry_reader.rs index b78ce6b..fc82b7c 100644 --- a/crates/rc-zip/src/reader/sync/entry_reader.rs +++ b/crates/rc-zip/src/reader/sync/entry_reader.rs @@ -314,36 +314,47 @@ where } // from `lzma-specification.txt` - fn decode_properties(mut d: u8) -> LzmaProperties { + fn decode_properties(mut d: u8) -> Result { + if d >= (9 * 5 * 5) { + return Err(FormatError::LzmaPropertiesLargerThanMax); + } + let lc = d % 9; d /= 9; let pb = d / 5; let lp = d % 5; - LzmaProperties { + Ok(LzmaProperties { literal_context_bits: lc, literal_pos_state_bits: lp, pos_state_bits: pb, - } + }) } let props = decode_properties(bits_byte); + trace!("LZMA properties: {:#?}", props); + const LZMA_DIC_MIN: u32 = 1 << 12; + let dict_size_read = limited_reader.read_u32::()?; + trace!("LZMA dictionary size (raw): {}", dict_size_read); let dict_size: u32 = - std::cmp::min(LZMA_DIC_MIN, limited_reader.read_u32::()?); + std::cmp::max(LZMA_DIC_MIN, dict_size_read); + trace!("LZMA dictionary size: {}", dict_size); + + // let mut opts = xz2::stream::LzmaOptions::new_preset(0)?; + // opts.dict_size(dict_size); + // opts.position_bits(props.pos_state_bits as _); + // opts.literal_position_bits(props.literal_pos_state_bits as _); + // opts.literal_context_bits(props.literal_context_bits as _); - let mut opts = xz2::stream::LzmaOptions::new_preset(0)?; - opts.dict_size(dict_size); - opts.position_bits(props.pos_state_bits as _); - opts.literal_position_bits(props.literal_pos_state_bits as _); - opts.literal_context_bits(props.literal_context_bits as _); + // let mut filters = xz2::stream::Filters::new(); + // filters.lzma2(&opts); - let mut filters = xz2::stream::Filters::new(); - filters.lzma2(&opts); - // let stream = xz2::stream::Stream::new_lzma_decoder(&filters)?; + // uncompressed size is stored as a little-endian 64-bit integer + let uncompressed_size: u64 = limited_reader.read_u64::()?; + trace!("LZMA uncompressed size: {}", uncompressed_size); - // let stream = xz2::stream::Stream::new_lzma_encoder(&opts)?; - let stream = xz2::stream::Stream::new_stream_encoder(&filters, xz2::stream::Check::None)?; + let stream = xz2::stream::Stream::new_lzma_decoder(128 * 1024 * 1024)?; Box::new(xz2::read::XzDecoder::new_stream(limited_reader, stream)) } else {