From 7b4186bd3d175e7c3e54af65197e3df7afeb419e Mon Sep 17 00:00:00 2001 From: Florian Guggi Date: Wed, 7 Aug 2024 12:35:45 +0200 Subject: [PATCH] Add simple archive crate This crate is used to serialize result files into a simple format of u8 path length - UTF-8 path - u32 data length - data Changelog: Added --- Cargo.lock | 176 +++++++++++++++++++++++++++++++- Cargo.toml | 1 + simple-archive/Cargo.toml | 12 +++ simple-archive/src/lib.rs | 205 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 393 insertions(+), 1 deletion(-) create mode 100644 simple-archive/Cargo.toml create mode 100644 simple-archive/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index e8ef98f..d7857c2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -39,6 +39,30 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "adler2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" + +[[package]] +name = "adler32" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234" + +[[package]] +name = "ahash" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" +dependencies = [ + "cfg-if", + "once_cell", + "version_check", + "zerocopy", +] + [[package]] name = "aho-corasick" version = "1.1.3" @@ -48,6 +72,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "allocator-api2" +version = "0.2.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c6cb57a04249c6480766f7f7cef5467412af1490f8d1e243141daddada3264f" + [[package]] name = "anyhow" version = "1.0.86" @@ -73,7 +103,7 @@ dependencies = [ "cc", "cfg-if", "libc", - "miniz_oxide", + "miniz_oxide 0.7.4", "object", "rustc-demangle", ] @@ -90,6 +120,12 @@ version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" +[[package]] +name = "bumpalo" +version = "3.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" + [[package]] name = "byteorder" version = "1.5.0" @@ -114,6 +150,15 @@ version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" +[[package]] +name = "core2" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b49ba7ef1ad6107f8824dbe97de947cbaac53c44e7f9756a1fba0d37c1eec505" +dependencies = [ + "memchr", +] + [[package]] name = "crc" version = "3.2.1" @@ -129,6 +174,15 @@ version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" +[[package]] +name = "crc32fast" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" +dependencies = [ + "cfg-if", +] + [[package]] name = "crossterm" version = "0.25.0" @@ -154,6 +208,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "dary_heap" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7762d17f1241643615821a8455a0b2c3e803784b058693d990b11f2dce25a0ca" + [[package]] name = "deranged" version = "0.3.11" @@ -228,6 +288,16 @@ dependencies = [ "serde", ] +[[package]] +name = "flate2" +version = "1.0.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "324a1be68054ef05ad64b861cc9eaf1d623d2d8cb25b4bf2cb9cdd902b4bf253" +dependencies = [ + "crc32fast", + "miniz_oxide 0.8.0", +] + [[package]] name = "fuzzy-matcher" version = "0.3.7" @@ -257,6 +327,10 @@ name = "hashbrown" version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +dependencies = [ + "ahash", + "allocator-api2", +] [[package]] name = "heck" @@ -336,6 +410,30 @@ version = "0.2.155" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" +[[package]] +name = "libflate" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45d9dfdc14ea4ef0900c1cddbc8dcd553fbaacd8a4a282cf4018ae9dd04fb21e" +dependencies = [ + "adler32", + "core2", + "crc32fast", + "dary_heap", + "libflate_lz77", +] + +[[package]] +name = "libflate_lz77" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e0d73b369f386f1c44abd9c570d5318f55ccde816ff4b562fa452e5182863d" +dependencies = [ + "core2", + "hashbrown", + "rle-decode-fast", +] + [[package]] name = "libudev" version = "0.3.0" @@ -372,6 +470,12 @@ dependencies = [ "scopeguard", ] +[[package]] +name = "lockfree-object-pool" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9374ef4228402d4b7e403e5838cb880d9ee663314b0a900d5a6aabf0c213552e" + [[package]] name = "log" version = "0.4.22" @@ -402,6 +506,15 @@ dependencies = [ "adler", ] +[[package]] +name = "miniz_oxide" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" +dependencies = [ + "adler2", +] + [[package]] name = "mio" version = "0.8.11" @@ -579,6 +692,12 @@ version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" +[[package]] +name = "rle-decode-fast" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3582f63211428f83597b51b2ddb88e2a91a9d52d12831f9d08f5e624e8977422" + [[package]] name = "rmp" version = "0.8.14" @@ -719,6 +838,21 @@ dependencies = [ "libc", ] +[[package]] +name = "simd-adler32" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe" + +[[package]] +name = "simple-archive" +version = "0.1.0" +dependencies = [ + "flate2", + "libflate", + "zopfli", +] + [[package]] name = "simplelog" version = "0.12.2" @@ -956,6 +1090,12 @@ version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d" +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" @@ -1160,3 +1300,37 @@ dependencies = [ "linux-raw-sys", "rustix", ] + +[[package]] +name = "zerocopy" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zopfli" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5019f391bac5cf252e93bbcc53d039ffd62c7bfb7c150414d61369afe57e946" +dependencies = [ + "bumpalo", + "crc32fast", + "lockfree-object-pool", + "log", + "once_cell", + "simd-adler32", +] diff --git a/Cargo.toml b/Cargo.toml index 81f39e0..a49199d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,6 +3,7 @@ resolver = "2" members = [ "scheduler", "filevec", + "simple-archive" ] [workspace.lints.clippy] diff --git a/simple-archive/Cargo.toml b/simple-archive/Cargo.toml new file mode 100644 index 0000000..3379487 --- /dev/null +++ b/simple-archive/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "simple-archive" +version = "0.1.0" +edition = "2021" + +[dependencies] +flate2 = "1.0.33" +libflate = "2.1.0" +zopfli = "0.8.1" + +[lints] +workspace = true diff --git a/simple-archive/src/lib.rs b/simple-archive/src/lib.rs new file mode 100644 index 0000000..70a9748 --- /dev/null +++ b/simple-archive/src/lib.rs @@ -0,0 +1,205 @@ +use std::io::{ErrorKind, Read, Write}; +use zopfli::{Format, Options}; + +pub struct Writer(T); + +#[derive(Debug, Clone, Copy)] +pub enum Compression { + None, + Zopfli, +} + +impl Writer { + pub fn new(target: T) -> Self { + Self(target) + } + + pub fn into_inner(self) -> T { + self.0 + } + + pub fn append_data( + &mut self, + path: &str, + data: &[u8], + compression: Compression, + ) -> std::io::Result<()> { + let path_len: u8 = + try_into_io_result(path.len(), "path must not be longer than 255 chars")?; + self.0.write_all(&path_len.to_le_bytes())?; + self.0.write_all(path.as_bytes())?; + + match compression { + Compression::None => self.write_data(data), + Compression::Zopfli => { + let mut buffer = vec![]; + zopfli::compress(Options::default(), Format::Gzip, data, &mut buffer)?; + self.write_data(&buffer) + } + } + } + + fn write_data(&mut self, data: &[u8]) -> std::io::Result<()> { + let data_len: u32 = + try_into_io_result(data.len(), "data must not be longer than u32::MAX")?; + self.0.write_all(&data_len.to_le_bytes())?; + self.0.write_all(data)?; + Ok(()) + } + + pub fn append_file(&mut self, path: &str, compression: Compression) -> std::io::Result<()> { + let data = std::fs::read(path)?; + self.append_data(path, &data, compression) + } +} + +fn try_into_io_result, U>(val: T, other_msg: &str) -> std::io::Result { + val.try_into().map_err(|_| std::io::Error::other(other_msg)) +} + +pub struct Reader(T); + +impl Reader { + pub fn new(reader: T) -> Self { + Self(reader) + } + + pub fn into_inner(self) -> T { + self.0 + } + + fn next_entry(&mut self) -> std::io::Result { + let mut path_len = [0; 1]; + self.0.read_exact(&mut path_len)?; + + let mut path = vec![0; u8::from_le_bytes(path_len) as usize]; + self.0.read_exact(&mut path)?; + + let mut data_len = [0; 4]; + self.0.read_exact(&mut data_len)?; + + let mut data = vec![0; u32::from_le_bytes(data_len) as usize]; + self.0.read_exact(&mut data)?; + + Ok(Entry { + path: String::from_utf8_lossy(&path).to_string(), + data: Self::try_to_enflate(data), + }) + } + + fn try_to_enflate(data: Vec) -> Vec { + const GZIP_MAGIC_NUMBER: [u8; 2] = [0x1f, 0x8b]; + if !data.starts_with(&GZIP_MAGIC_NUMBER) { + return data; + } + + let mut decoder = flate2::read::GzDecoder::new(&data[..]); + let mut result = vec![]; + if decoder.read_to_end(&mut result).is_ok() { + result + } else { + drop(decoder); + data + } + } +} + +impl Iterator for Reader { + type Item = std::io::Result; + + fn next(&mut self) -> Option { + match self.next_entry() { + Err(ref e) if e.kind() == ErrorKind::UnexpectedEof => None, + r => Some(r), + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Entry { + pub path: String, + pub data: Vec, +} + +#[cfg(test)] +mod tests { + use std::{ + io::Cursor, + process::{Command, Stdio}, + }; + + use super::*; + + #[test] + fn data_is_encoded_correctly() { + let mut res = dummy(); + + res.append_data("abc", &[1, 2, 3, 4], Compression::None).unwrap(); + + assert_eq!( + res.into_inner().into_inner(), + vec![3, b'a', b'b', b'c', 4, 0, 0, 0, 1, 2, 3, 4] + ); + } + + #[test] + fn path_longer_than_255_is_rejected() { + let mut res = dummy(); + + let err = res.append_data(&"a".repeat(256), &[], Compression::None).unwrap_err(); + + assert_eq!(err.kind(), std::io::ErrorKind::Other); + } + + #[test] + fn data_longer_than_u32_max_is_rejected() { + let mut res = dummy(); + + let err = + res.append_data("abc", &vec![0; u32::MAX as usize + 1], Compression::None).unwrap_err(); + + assert_eq!(err.kind(), std::io::ErrorKind::Other); + } + + #[test] + fn data_is_compressed() { + let mut res = dummy(); + + res.append_data("abc", &[0; 512], Compression::Zopfli).unwrap(); + + let res = res.into_inner().into_inner(); + assert!(res.len() < 100); + assert_eq!(u32::from_le_bytes(res[4..8].try_into().unwrap()) as usize, res.len() - 8); + + let mut zcat = + Command::new("zcat").stdin(Stdio::piped()).stdout(Stdio::piped()).spawn().unwrap(); + zcat.stdin.take().unwrap().write_all(&res[8..]).unwrap(); + let decompressed = zcat.wait_with_output().unwrap().stdout; + + assert_eq!(decompressed, vec![0; 512]); + } + + #[test] + fn can_decompress() { + let mut data = dummy(); + + data.append_data("abc", &[1, 2, 3, 4, 5], Compression::Zopfli).unwrap(); + data.append_data("def", &[1, 2], Compression::None).unwrap(); + + let mut data = data.into_inner(); + data.set_position(0); + let mut decoder = Reader::new(data); + + let first = decoder.next().unwrap().unwrap(); + assert_eq!(first.path, "abc"); + assert_eq!(first.data, vec![1, 2, 3, 4, 5]); + let second = decoder.next().unwrap().unwrap(); + assert_eq!(second.path, "def"); + assert_eq!(second.data, vec![1, 2]); + assert!(decoder.next().is_none()); + } + + fn dummy() -> Writer>> { + Writer::new(Cursor::new(vec![])) + } +}