diff --git a/Cargo.lock b/Cargo.lock index f2758b59..6bec8181 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -180,6 +180,15 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "597bb81c80a54b6a4381b23faba8d7774b144c94cbd1d6fe3f1329bd776554ab" +[[package]] +name = "bincode" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" +dependencies = [ + "serde", +] + [[package]] name = "bindgen" version = "0.65.1" @@ -1677,8 +1686,6 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae9e413cb7387bbb4405e960920e5d8c5f255ec4a86f021a18a455014565e749" dependencies = [ "az", "byteorder", @@ -1728,6 +1735,7 @@ dependencies = [ "anyhow", "assert_cmd", "assert_matches", + "bincode", "camino", "csv", "env_logger", diff --git a/Cargo.toml b/Cargo.toml index edbfb094..10388139 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,8 @@ crate-type = ["cdylib"] pyo3 = { version = "0.20.3", features = ["extension-module", "anyhow"] } rayon = "1.8.1" serde = { version = "1.0.196", features = ["derive"] } -sourmash = { version = "0.13.0", features = ["branchwater"] } +#sourmash = { version = "0.13.0", features = ["branchwater"] } +sourmash = { path="../sourmash/src/core", features = ["branchwater"] } serde_json = "1.0.113" niffler = "2.4.0" log = "0.4.14" @@ -26,6 +27,7 @@ csv = "1.3.0" camino = "1.1.6" glob = "0.3.1" rustworkx-core = "0.14.0" +bincode = "1.3.3" [dev-dependencies] assert_cmd = "2.0.14" diff --git a/src/lib.rs b/src/lib.rs index f4647156..e5effbba 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -270,8 +270,9 @@ fn do_manysketch( output: String, singleton: bool, force: bool, + use_bincode: bool, ) -> anyhow::Result { - match manysketch::manysketch(filelist, param_str, output, singleton, force) { + match manysketch::manysketch(filelist, param_str, output, singleton, force, use_bincode) { Ok(_) => Ok(0), Err(e) => { eprintln!("Error: {e}"); diff --git a/src/manysketch.rs b/src/manysketch.rs index a23e0f36..0f9e244e 100644 --- a/src/manysketch.rs +++ b/src/manysketch.rs @@ -119,6 +119,7 @@ pub fn manysketch( output: String, singleton: bool, force: bool, + use_bincode: bool, ) -> Result<(), Box> { let (fileinfo, n_fastas) = match load_fasta_fromfile(filelist, force) { Ok((file_info, n_fastas)) => (file_info, n_fastas), @@ -144,7 +145,7 @@ pub fn manysketch( let send = std::sync::Arc::new(send); // & spawn a thread that is dedicated to printing to a buffered output - let thrd = sigwriter(recv, output); + let thrd = sigwriter(recv, output, use_bincode); // parse param string into params_vec, print error if fail let param_result = parse_params_str(param_str); diff --git a/src/python/sourmash_plugin_branchwater/__init__.py b/src/python/sourmash_plugin_branchwater/__init__.py index 39dd6b59..c9eea8b2 100755 --- a/src/python/sourmash_plugin_branchwater/__init__.py +++ b/src/python/sourmash_plugin_branchwater/__init__.py @@ -345,6 +345,8 @@ def __init__(self, p): help='build one sketch per FASTA record, i.e. multiple sketches per FASTA file') p.add_argument('-f', '--force', action="store_true", help='allow use of individual FASTA files in more than more sketch') + p.add_argument('-b', '--use-bincode', action="store_true", + help='serialize signatures using bincode.') def main(self, args): print_version() @@ -366,7 +368,8 @@ def main(self, args): args.param_string, args.output, args.singleton, - args.force) + args.force, + args.use_bincode) if status == 0: notify(f"...manysketch is done! results in '{args.output}'") return status diff --git a/src/python/tests/test_multigather.py b/src/python/tests/test_multigather.py index 486d023c..35afb2da 100644 --- a/src/python/tests/test_multigather.py +++ b/src/python/tests/test_multigather.py @@ -976,7 +976,7 @@ def test_indexed_full_output(runtmp): f_unique_weighted = set(df['f_unique_weighted']) f_unique_weighted = set([round(x, 4) for x in f_unique_weighted]) - assert f_unique_weighted == {0.0063, 0.002, 0.0062} + assert f_unique_weighted == {0.0063, 0.0062, 0.0062} unique_intersect_bp = set(df['unique_intersect_bp']) unique_intersect_bp = set([round(x,4) for x in unique_intersect_bp]) diff --git a/src/python/tests/test_sketch.py b/src/python/tests/test_sketch.py index ecfae2a7..797ef4c0 100644 --- a/src/python/tests/test_sketch.py +++ b/src/python/tests/test_sketch.py @@ -786,3 +786,27 @@ def test_manysketch_prefix_duplicated_force(runtmp, capfd): print(sigs) assert len(sigs) == 3 + + +def test_manysketch_simple_bincode(runtmp): + fa_csv = runtmp.output('db-fa.txt') + + fa1 = get_test_data('short.fa') + fa2 = get_test_data('short2.fa') + fa3 = get_test_data('short3.fa') + + make_assembly_csv(fa_csv, [fa1, fa2, fa3]) + + output = runtmp.output('db.zip') + + runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output, + '--param-str', "dna,k=31,scaled=1", '--use-bincode') + + assert os.path.exists(output) + assert not runtmp.last_result.out # stdout should be empty + + idx = sourmash.load_file_as_index(output) + sigs = list(idx.signatures()) + print(sigs) + + assert len(sigs) == 3 \ No newline at end of file diff --git a/src/utils.rs b/src/utils.rs index 4c1bb912..792d9637 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -17,6 +17,7 @@ use std::panic; use std::sync::atomic; use std::sync::atomic::AtomicUsize; +use bincode::{deserialize_from, serialize_into, Error}; use sourmash::collection::Collection; use sourmash::manifest::{Manifest, Record}; use sourmash::selection::Selection; @@ -24,6 +25,7 @@ use sourmash::signature::{Signature, SigsTrait}; use sourmash::sketch::minhash::KmerMinHash; use sourmash::storage::{FSStorage, InnerStorage, SigStore}; use std::collections::{HashMap, HashSet}; + /// Track a name/minhash. pub struct SmallSignature { @@ -1054,6 +1056,7 @@ pub enum ZipMessage { pub fn sigwriter( recv: std::sync::mpsc::Receiver, output: String, + use_bincode: bool, ) -> std::thread::JoinHandle> { std::thread::spawn(move || -> Result<()> { // cast output as pathbuf @@ -1081,7 +1084,13 @@ pub fn sigwriter( } else { format!("signatures/{}.sig.gz", md5sum_str) }; - write_signature(sig, &mut zip, options, &sig_filename); + if use_bincode { + serialize_signature(sig, &mut zip, options, &sig_filename) + .context("failed to serialize signature."); + eprintln!("SERIALIZING USING BINCODE") + } else { + write_signature(sig, &mut zip, options, &sig_filename); + } let records: Vec = Record::from_sig(sig, sig_filename.as_str()); manifest_rows.extend(records); } @@ -1149,3 +1158,32 @@ pub fn write_signature( zip.start_file(sig_filename, zip_options).unwrap(); zip.write_all(&gzipped_buffer).unwrap(); } + +fn serialize_signature( + sig: &Signature, + zip: &mut zip::ZipWriter>, + zip_options: zip::write::FileOptions, + sig_filename: &str, +) -> Result<()> { + // Serialize the signature using Bincode + let mut buffer = Vec::new(); + bincode::serialize_into(&mut buffer, &sig)?; + + // Write the serialized data to the zip file + zip.start_file(sig_filename, zip_options)?; + zip.write_all(&buffer)?; + + Ok(()) +} +// fn write_manifest_to_zip( +// manifest_rows: &[Record], +// zip: &mut zip::ZipWriter>, +// ) -> Result<(), Box> { +// // Write manifest rows to CSV format +// for record in manifest_rows { +// let csv_row = record.to_csv_row(); +// zip.write_all(csv_row.as_bytes())?; +// } + +// Ok(()) +// }