Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New read api #233

Draft
wants to merge 5 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,10 @@ default = [
name = "read_entry"
harness = false

[[bench]]
name = "decompress_sink"
harness = false
# Only build this bench when the compression features it exercises exist;
# benches/decompress_sink.rs is gated by a crate-level
# `#![cfg(all(feature = "_deflate-any", feature = "bzip2"))]`, which empties
# the crate when those features are off and makes `--no-default-features`
# builds fail with "`main` function not found".
required-features = ["_deflate-any", "bzip2"]

[[bench]]
name = "read_metadata"
harness = false
Expand Down
119 changes: 119 additions & 0 deletions benches/decompress_sink.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
#![cfg(all(feature = "_deflate-any", feature = "bzip2"))]

use bencher::{benchmark_group, benchmark_main};

use std::fs;
use std::io::{self, prelude::*};
use std::path::Path;

use bencher::Bencher;

use zip::read::stream::{ZipStreamFileMetadata, ZipStreamReader, ZipStreamVisitor};
use zip::read::ZipFile;
use zip::unstable::read::streaming::StreamingArchive;
use zip::{result::ZipResult, ZipArchive};

/* This contains the compressed text of King Lear from Project Gutenberg, in the public domain.
* It contains the text of King Lear in multiple formats:
* 1. Stored (uncompressed)
* 2. Deflated (compresslevel=9)
* 3. Bzip2 (compresslevel=9)
* It contains 50 of each, with 150 entries total. This is generated by the script
* tests/data/generate-king-lear-zip.py.
*/
/// Open the checked-in `king-lear-compressed.zip` benchmark fixture.
///
/// The archive holds the public-domain text of King Lear 150 times over:
/// 50 entries each as Stored, Deflated (level 9), and Bzip2 (level 9),
/// generated by tests/data/generate-king-lear-zip.py.
fn get_test_data() -> ZipResult<ZipArchive<fs::File>> {
    let archive_path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/data/king-lear-compressed.zip");
    ZipArchive::new(fs::File::open(archive_path)?)
}

/// Benchmark the new generic read API: decompress every entry via
/// `by_index_generic` and discard the bytes into `io::sink()`.
fn write_entry_to_sink_generic(bench: &mut Bencher) {
    let mut archive = get_test_data().unwrap();
    // Report throughput in decompressed bytes per iteration.
    let total_size: u64 = archive.decompressed_size().unwrap().try_into().unwrap();
    bench.bytes = total_size;

    // A single timed pass is enough: each iteration already touches all 150 entries.
    bench.bench_n(1, |b| {
        b.iter(|| {
            let entry_count = archive.len();
            for index in 0..entry_count {
                let mut entry = archive.by_index_generic(index).unwrap();
                io::copy(&mut entry, &mut io::sink()).unwrap();
            }
        })
    });
}

/// Benchmark the pre-existing read API (`by_index`) as the baseline to compare
/// against `write_entry_to_sink_generic`: same archive, same sink.
fn write_entry_to_sink_standard(bench: &mut Bencher) {
    let mut archive = get_test_data().unwrap();
    // Report throughput in decompressed bytes per iteration.
    let total_size: u64 = archive.decompressed_size().unwrap().try_into().unwrap();
    bench.bytes = total_size;

    // A single timed pass is enough: each iteration already touches all 150 entries.
    bench.bench_n(1, |b| {
        b.iter(|| {
            let entry_count = archive.len();
            for index in 0..entry_count {
                let mut entry = archive.by_index(index).unwrap();
                io::copy(&mut entry, &mut io::sink()).unwrap();
            }
        })
    });
}

/// Benchmark the new streaming API (`StreamingArchive`): read the archive
/// front-to-back without a central-directory lookup, discarding all bytes.
fn write_stream_to_sink_generic(bench: &mut Bencher) {
    let archive = get_test_data().unwrap();
    // Report throughput in decompressed bytes per iteration.
    let total_size: u64 = archive.decompressed_size().unwrap().try_into().unwrap();
    let mut reader = archive.into_inner();
    bench.bytes = total_size;

    bench.bench_n(1, |b| {
        b.iter(|| {
            // Each iteration restarts from the beginning of the raw file.
            reader.rewind().unwrap();
            let mut streaming = StreamingArchive::new(&mut reader);

            // Drain every file body first...
            while let Some(mut entry) = streaming.next_entry().unwrap() {
                io::copy(&mut entry, &mut io::sink()).unwrap();
            }
            // ...then consume the trailing metadata-only entries.
            while streaming.next_metadata_entry().unwrap().is_some() {}
        })
    });
}

/// Benchmark the pre-existing streaming visitor API (`ZipStreamReader`) as the
/// baseline for `write_stream_to_sink_generic`: file bodies go to `io::sink()`
/// and metadata entries are acknowledged but ignored.
fn write_stream_to_sink_standard(bench: &mut Bencher) {
    // Visitor that discards every file body and all additional metadata.
    struct SinkVisitor;
    impl ZipStreamVisitor for SinkVisitor {
        fn visit_file(&mut self, file: &mut ZipFile) -> ZipResult<()> {
            io::copy(file, &mut io::sink())?;
            Ok(())
        }
        fn visit_additional_metadata(
            &mut self,
            _metadata: &ZipStreamFileMetadata,
        ) -> ZipResult<()> {
            Ok(())
        }
    }

    let archive = get_test_data().unwrap();
    // Report throughput in decompressed bytes per iteration.
    let total_size: u64 = archive.decompressed_size().unwrap().try_into().unwrap();
    let mut reader = archive.into_inner();
    bench.bytes = total_size;

    bench.bench_n(1, |b| {
        b.iter(|| {
            // Each iteration restarts from the beginning of the raw file.
            reader.rewind().unwrap();
            ZipStreamReader::new(&mut reader)
                .visit(&mut SinkVisitor)
                .unwrap();
        })
    });
}

// Register the two generic-API benchmarks alongside their standard-API baselines.
benchmark_group!(
    benches,
    write_entry_to_sink_generic,
    write_entry_to_sink_standard,
    write_stream_to_sink_generic,
    write_stream_to_sink_standard,
);

// NOTE(review): `benchmark_main!` expands to `fn main`, but the crate-level
// feature cfg at the top of this file empties the whole crate when the
// required features are disabled, so `--no-default-features` builds fail with
// "`main` function not found" (see the CI annotation on this file). Gating the
// bench target in Cargo.toml via `required-features` avoids building this file
// at all in that configuration.
benchmark_main!(benches);

Check failure on line 119 in benches/decompress_sink.rs

View workflow job for this annotation

GitHub Actions / style_and_docs (--no-default-features)

`main` function not found in crate `decompress_sink`
8 changes: 5 additions & 3 deletions benches/read_entry.rs
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
use bencher::{benchmark_group, benchmark_main};

use std::io::{Cursor, Read, Write};
use std::io::{prelude::*, Cursor};

use bencher::Bencher;
use getrandom::getrandom;
use zip::{write::SimpleFileOptions, ZipArchive, ZipWriter};

use zip::{write::SimpleFileOptions, CompressionMethod, ZipArchive, ZipWriter};

fn generate_random_archive(size: usize) -> Vec<u8> {
let data = Vec::new();
let mut writer = ZipWriter::new(Cursor::new(data));
let options = SimpleFileOptions::default().compression_method(zip::CompressionMethod::Stored);
let options = SimpleFileOptions::default().compression_method(CompressionMethod::Stored);

writer.start_file("random.dat", options).unwrap();
let mut bytes = vec![0u8; size];
Expand Down Expand Up @@ -39,4 +40,5 @@ fn read_entry(bench: &mut Bencher) {
}

// Register the single benchmark and emit the bencher-provided `fn main`.
benchmark_group!(benches, read_entry);

benchmark_main!(benches);
28 changes: 26 additions & 2 deletions fuzz/fuzz_targets/fuzz_read.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
#![no_main]

use libfuzzer_sys::fuzz_target;
use std::io::{Read, Seek, SeekFrom};
use tikv_jemallocator::Jemalloc;

use std::io::prelude::*;

use zip::read::read_zipfile_from_stream;
use zip::unstable::read::streaming::StreamingArchive;

const MAX_BYTES_TO_READ: u64 = 1 << 24;

Expand All @@ -19,13 +22,34 @@ fn decompress_all(data: &[u8]) -> Result<(), Box<dyn std::error::Error>> {
std::io::copy(&mut file, &mut std::io::sink())?;
}
let mut reader = zip.into_inner();
reader.seek(SeekFrom::Start(0))?;
reader.rewind()?;
while let Ok(Some(mut file)) = read_zipfile_from_stream(&mut reader) {
std::io::copy(&mut file, &mut std::io::sink())?;
}
Ok(())
}

/// Exercise the new "generic" read APIs against the fuzz input: one pass of
/// random access via `by_index_generic`, then a second streaming pass over the
/// same bytes with `StreamingArchive`. Errors are fine; panics are findings.
fn decompress_generic(data: &[u8]) -> Result<(), Box<dyn std::error::Error>> {
    let reader = std::io::Cursor::new(data);
    let mut zip = zip::ZipArchive::new(reader)?;

    for i in 0..zip.len() {
        // Cap each entry so a hostile size field can't make the copy run away.
        let mut file = zip.by_index_generic(i)?.take(MAX_BYTES_TO_READ);
        std::io::copy(&mut file, &mut std::io::sink())?;
    }

    let mut reader = zip.into_inner();
    reader.rewind()?;
    let mut stream_zip = StreamingArchive::new(reader);

    while let Some(mut file) = stream_zip.next_entry()? {
        std::io::copy(&mut file, &mut std::io::sink())?;
    }
    // Drain the trailing metadata-only entries. `.is_some()` instead of
    // `while let Some(_) = ...` avoids clippy::redundant_pattern_matching and
    // matches the equivalent loop in benches/decompress_sink.rs.
    while stream_zip.next_metadata_entry()?.is_some() {}
    Ok(())
}

// Entry point for libFuzzer: run both decompression paths over the raw input.
// `Err` results are expected for malformed archives and deliberately ignored —
// only panics, aborts, or sanitizer reports count as fuzzing findings.
fuzz_target!(|data: &[u8]| {
    let _ = decompress_all(data);
    let _ = decompress_generic(data);
});
8 changes: 5 additions & 3 deletions src/aes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ pub struct AesReader<R> {
data_length: u64,
}

impl<R: Read> AesReader<R> {
impl<R> AesReader<R> {
pub const fn new(reader: R, aes_mode: AesMode, compressed_size: u64) -> AesReader<R> {
let data_length = compressed_size
- (PWD_VERIFY_LENGTH + AUTH_CODE_LENGTH + aes_mode.salt_length()) as u64;
Expand All @@ -77,7 +77,9 @@ impl<R: Read> AesReader<R> {
data_length,
}
}
}

impl<R: Read> AesReader<R> {
/// Read the AES header bytes and validate the password.
///
/// Even if the validation succeeds, there is still a 1 in 65536 chance that an incorrect
Expand Down Expand Up @@ -150,7 +152,7 @@ impl<R: Read> AesReader<R> {
/// There is a 1 in 65536 chance that an invalid password passes that check.
/// After the data has been read and decrypted an HMAC will be checked and provide a final means
/// to check if either the password is invalid or if the data has been changed.
pub struct AesReaderValid<R: Read> {
pub struct AesReaderValid<R> {
reader: R,
data_remaining: u64,
cipher: Cipher,
Expand Down Expand Up @@ -214,7 +216,7 @@ impl<R: Read> Read for AesReaderValid<R> {
}
}

impl<R: Read> AesReaderValid<R> {
impl<R> AesReaderValid<R> {
/// Consumes this decoder, returning the underlying reader.
pub fn into_inner(self) -> R {
self.reader
Expand Down
104 changes: 104 additions & 0 deletions src/crc32.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,110 @@ impl<R: Read> Read for Crc32Reader<R> {
}
}

pub(crate) mod non_crypto {
use std::io;
use std::io::prelude::*;

use crc32fast::Hasher;

/// Reader that validates the CRC32 when it reaches the EOF.
pub struct Crc32Reader<R> {
inner: R,
hasher: Hasher,
check: u32,
}

impl<R> Crc32Reader<R> {
/// Get a new Crc32Reader which checks the inner reader against checksum.
pub(crate) fn new(inner: R, checksum: u32) -> Self {
Crc32Reader {
inner,
hasher: Hasher::new(),
check: checksum,
}
}

fn check_matches(&self) -> Result<(), &'static str> {
let res = self.hasher.clone().finalize();
if self.check == res {
Ok(())
} else {
/* TODO: make this into our own Crc32Error error type! */
Dismissed Show dismissed Hide dismissed
Err("Invalid checksum")
}
}
}

impl<R: Read> Read for Crc32Reader<R> {
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
/* We want to make sure we only check the hash when the input stream is exhausted. */
if buf.is_empty() {
/* If the input buf is empty (this shouldn't happen, but isn't guaranteed), we
* still want to "pull" from the source in case it surfaces an i/o error. This will
* always return a count of Ok(0) if successful. */
return self.inner.read(buf);
}

let count = self.inner.read(buf)?;
if count == 0 {
return self
.check_matches()
.map(|()| 0)
/* TODO: use io::Error::other for MSRV >=1.74 */
.map_err(|e| io::Error::new(io::ErrorKind::Other, e));
}
self.hasher.update(&buf[..count]);
Ok(count)
}
}

#[cfg(test)]
mod test {
use super::*;

#[test]
fn test_empty_reader() {
let data: &[u8] = b"";
let mut buf = [0; 1];

let mut reader = Crc32Reader::new(data, 0);
assert_eq!(reader.read(&mut buf).unwrap(), 0);

let mut reader = Crc32Reader::new(data, 1);
assert!(reader
.read(&mut buf)
.unwrap_err()
.to_string()
.contains("Invalid checksum"));
}

#[test]
fn test_byte_by_byte() {
let data: &[u8] = b"1234";
let mut buf = [0; 1];

let mut reader = Crc32Reader::new(data, 0x9be3e0a3);
assert_eq!(reader.read(&mut buf).unwrap(), 1);
assert_eq!(reader.read(&mut buf).unwrap(), 1);
assert_eq!(reader.read(&mut buf).unwrap(), 1);
assert_eq!(reader.read(&mut buf).unwrap(), 1);
assert_eq!(reader.read(&mut buf).unwrap(), 0);
// Can keep reading 0 bytes after the end
assert_eq!(reader.read(&mut buf).unwrap(), 0);
}

#[test]
fn test_zero_read() {
let data: &[u8] = b"1234";
let mut buf = [0; 5];

let mut reader = Crc32Reader::new(data, 0x9be3e0a3);
assert_eq!(reader.read(&mut buf[..0]).unwrap(), 0);
assert_eq!(reader.read(&mut buf).unwrap(), 4);
}
}
}

#[cfg(test)]
mod test {
use super::*;
Expand Down
2 changes: 1 addition & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@
#![warn(missing_docs)]
#![allow(unexpected_cfgs)] // Needed for cfg(fuzzing) on nightly as of 2024-05-06
pub use crate::compression::{CompressionMethod, SUPPORTED_COMPRESSION_METHODS};
pub use crate::read::HasZipMetadata;
pub use crate::read::ZipArchive;
pub use crate::read::{EntryData, HasZipMetadata};
pub use crate::spec::{ZIP64_BYTES_THR, ZIP64_ENTRY_THR};
pub use crate::types::{AesMode, DateTime};
pub use crate::write::ZipWriter;
Expand Down
Loading
Loading