From f1edc2c55e1638fbae2fa4c676f277408ecc9fd7 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Wed, 27 Sep 2023 13:04:30 -0700 Subject: [PATCH 01/47] init changes --- Cargo.lock | 178 +++++++++++++++++++++------------------------------ Cargo.toml | 3 +- src/index.rs | 60 +++++++++++++---- 3 files changed, 122 insertions(+), 119 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 11246cb2..0e453e9e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,12 +2,6 @@ # It is not intended for manual editing. version = 3 -[[package]] -name = "Inflector" -version = "0.11.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe438c63458706e03479442743baae6c88256498e6431708f6dfc520a26515d3" - [[package]] name = "adler" version = "1.0.2" @@ -127,6 +121,12 @@ dependencies = [ "thiserror", ] +[[package]] +name = "binary-merge" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "597bb81c80a54b6a4381b23faba8d7774b144c94cbd1d6fe3f1329bd776554ab" + [[package]] name = "bindgen" version = "0.65.1" @@ -269,6 +269,15 @@ dependencies = [ "pkg-config", ] +[[package]] +name = "camino" +version = "1.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c59e92b5a388f549b863a7bea62612c09f24c8393560709a54558a9abdfb3b9c" +dependencies = [ + "serde", +] + [[package]] name = "cc" version = "1.0.83" @@ -477,6 +486,18 @@ version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" +[[package]] +name = "enum_dispatch" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f33313078bb8d4d05a2733a94ac4c2d8a0df9a2b84424ebf4f33bfc224a890e" +dependencies = [ + "once_cell", + "proc-macro2", + "quote", + "syn 2.0.29", +] + [[package]] name = "env_logger" version = "0.10.0" @@ -542,37 +563,12 @@ dependencies = [ "num-traits", ] -[[package]] -name = 
"flume" -version = "0.10.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1657b4441c3403d9f7b3409e47575237dac27b1b5726df654a6ecbf92f0f7577" -dependencies = [ - "futures-core", - "futures-sink", - "nanorand", - "pin-project", - "spin", -] - [[package]] name = "funty" version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" -[[package]] -name = "futures-core" -version = "0.3.28" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c" - -[[package]] -name = "futures-sink" -version = "0.3.28" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e" - [[package]] name = "generic-array" version = "0.14.7" @@ -623,6 +619,12 @@ dependencies = [ "ahash", ] +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" + [[package]] name = "hermit-abi" version = "0.3.2" @@ -688,6 +690,15 @@ dependencies = [ "generic-array", ] +[[package]] +name = "inplace-vec-builder" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf64c2edc8226891a71f127587a2861b132d2b942310843814d5001d99a1d307" +dependencies = [ + "smallvec", +] + [[package]] name = "is-terminal" version = "0.4.9" @@ -844,9 +855,9 @@ checksum = "5486aed0026218e61b8a01d5fbd5a0a134649abb71a0e53b7bc088529dced86e" [[package]] name = "memmap2" -version = "0.5.10" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83faa42c0a078c393f6b29d5db232d8be22776a891f8f56e5284faee4a20b327" +checksum = "f49388d20533534cd19360ad3d6a7dadc885944aa802ba3995040c5ec11288c6" dependencies = [ "libc", ] @@ -881,15 
+892,6 @@ version = "0.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2983372caf4480544083767bf2d27defafe32af49ab4df3a0b7fc90793a3664" -[[package]] -name = "nanorand" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a51313c5820b0b02bd422f4b44776fbf47961755c74ce64afc73bfad10226c3" -dependencies = [ - "getrandom", -] - [[package]] name = "needletail" version = "0.5.1" @@ -988,25 +990,27 @@ checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" [[package]] name = "ouroboros" -version = "0.15.6" +version = "0.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1358bd1558bd2a083fed428ffeda486fbfb323e698cdda7794259d592ca72db" +checksum = "1c86de06555b970aec45229b27291b53154f21a5743a163419f4e4c0b065dcde" dependencies = [ "aliasable", "ouroboros_macro", + "static_assertions", ] [[package]] name = "ouroboros_macro" -version = "0.15.6" +version = "0.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f7d21ccd03305a674437ee1248f3ab5d4b1db095cf1caf49f1713ddf61956b7" +checksum = "8cad0c4b129e9696e37cb712b243777b90ef489a0bfaa0ac34e7d9b860e4f134" dependencies = [ - "Inflector", + "heck", + "itertools", "proc-macro-error", "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.29", ] [[package]] @@ -1061,39 +1065,20 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" -[[package]] -name = "pin-project" -version = "1.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fda4ed1c6c173e3fc7a83629421152e01d7b1f9b7f65fb301e490e8cfc656422" -dependencies = [ - "pin-project-internal", -] - -[[package]] -name = "pin-project-internal" -version = "1.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"4359fd9c9171ec6e8c62926d6faaf553a8dc3f64e1507e76da7911b4f6a04405" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.29", -] - [[package]] name = "piz" -version = "0.4.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58c75d1c00e6d407e283cc66d9d4fd0985ef1703c761520845b93c4f981bfb65" +checksum = "898b071c1938a2c92b95c18708cbf38f2566a01f0ab9dd7bdf4329987e5c2e17" dependencies = [ + "camino", "chrono", "codepage-437", "crc32fast", "flate2", "log", + "memchr", "thiserror", - "twoway", ] [[package]] @@ -1409,9 +1394,9 @@ dependencies = [ [[package]] name = "retain_mut" -version = "0.1.9" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4389f1d5789befaf6029ebd9f7dac4af7f7e3d61b69d4f30e2ac02b57e7712b0" +checksum = "8c31b5c4033f8fdde8700e4657be2c497e7288f01515be52168c631e2e4d4086" [[package]] name = "rkyv" @@ -1443,9 +1428,9 @@ dependencies = [ [[package]] name = "roaring" -version = "0.9.0" +version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd539cab4e32019956fe7e0cf160bb6d4802f4be2b52c4253d76d3bb0f85a5f7" +checksum = "6106b5cf8587f5834158895e9715a3c6c9716c8aefab57f1f7680917191c7873" dependencies = [ "bytemuck", "byteorder", @@ -1603,15 +1588,18 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.12.0" -source = "git+https://github.com/sourmash-bio/sourmash?rev=ff1092f8f366339caa59d7203f623813228f4356#ff1092f8f366339caa59d7203f623813228f4356" dependencies = [ "az", "bytecount", "byteorder", + "camino", "cfg-if", + "chrono", "counter", + "csv", + "enum_dispatch", "fixedbitset", - "flume", + "getrandom", "getset", "histogram", "log", @@ -1641,15 +1629,6 @@ dependencies = [ "web-sys", ] -[[package]] -name = "spin" -version = "0.9.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" -dependencies = [ - "lock_api", -] - [[package]] name = "static_assertions" version = "1.1.0" @@ -1787,16 +1766,6 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" -[[package]] -name = "twoway" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c57ffb460d7c24cd6eda43694110189030a3d1dfe418416d9468fd1c1d290b47" -dependencies = [ - "memchr", - "unchecked-index", -] - [[package]] name = "twox-hash" version = "1.6.3" @@ -1810,9 +1779,9 @@ dependencies = [ [[package]] name = "typed-builder" -version = "0.10.0" +version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89851716b67b937e393b3daa8423e67ddfc4bbbf1654bcf05488e95e0828db0c" +checksum = "64cba322cb9b7bc6ca048de49e83918223f35e7a86311267013afff257004870" dependencies = [ "proc-macro2", "quote", @@ -1825,12 +1794,6 @@ version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" -[[package]] -name = "unchecked-index" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eeba86d422ce181a719445e51872fa30f1f7413b62becb52e95ec91aa262d85c" - [[package]] name = "unicode-ident" version = "1.0.11" @@ -1857,10 +1820,13 @@ checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" [[package]] name = "vec-collections" -version = "0.3.6" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f2390c4dc8ae8640c57d067b1a3d40bc05c124cc6bc7394d761b53435d41b76" +checksum = "3c9965c8f2ffed1dbcd16cafe18a009642f540fa22661c6cfd6309ddb02e4982" dependencies = [ + "binary-merge", + "inplace-vec-builder", + "lazy_static", "num-traits", "serde", "smallvec", diff --git 
a/Cargo.toml b/Cargo.toml index 8d1938f2..c67fc419 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,8 @@ crate-type = ["cdylib"] pyo3 = { version = "0.19.2", features = ["extension-module", "anyhow"] } rayon = "1.8.0" serde = { version = "1.0.136", features = ["derive"] } -sourmash = { git = "https://github.com/sourmash-bio/sourmash", "rev" = "ff1092f8f366339caa59d7203f623813228f4356" } +#sourmash = { git = "https://github.com/sourmash-bio/sourmash", "rev" = "ff1092f8f366339caa59d7203f623813228f4356" } +sourmash = { path = "../sourmash/src/core", features = ["branchwater"] } serde_json = "1.0.107" niffler = "2.4.0" log = "0.4.14" diff --git a/src/index.rs b/src/index.rs index bee725cd..21f61162 100644 --- a/src/index.rs +++ b/src/index.rs @@ -1,4 +1,10 @@ -use sourmash::index::revindex::RevIndex; +//use sourmash::index::revindex::RevIndex; +use sourmash::collection::Collection; +use sourmash::index::revindex::{prepare_query, RevIndex, RevIndexOps}; +use sourmash::manifest::Manifest; +use sourmash::prelude::*; +use sourmash::signature::{Signature, SigsTrait}; +use sourmash::storage::{FSStorage, InnerStorage, ZipStorage}; use sourmash::sketch::Sketch; use std::path::Path; @@ -6,26 +12,56 @@ use crate::utils::{load_sigpaths_from_zip_or_pathlist, ReportType}; pub fn index>( siglist: P, - template: Sketch, + template: Sketch, + manifest: Option

, + selection: Selection, output: P, save_paths: bool, colors: bool, ) -> Result<(), Box> { println!("Loading siglist"); - let (index_sigs, _temp_dir) = - load_sigpaths_from_zip_or_pathlist(&siglist, &template, ReportType::Index)?; + // let (index_sigs, _temp_dir) = load_sigpaths_from_zip_or_pathlist(&siglist)?; - // if index_sigs pathlist is empty, bail - if index_sigs.is_empty() { - bail!("No signatures to index loaded, exiting."); - } + // // if index_sigs pathlist is empty, bail + // if index_sigs.is_empty() { + // bail!("No signatures to index loaded, exiting."); + // } - // Create or open the RevIndex database with the provided output path and colors flag - let db = RevIndex::create(output.as_ref(), colors); + // // Create or open the RevIndex database with the provided output path and colors flag + // let db = RevIndex::create(output.as_ref(), colors); - // Index the signatures using the loaded template, threshold, and save_paths option - db.index(index_sigs, &template, 0.0, save_paths); + // // Index the signatures using the loaded template, threshold, and save_paths option + // db.index(index_sigs, &template, 0.0, save_paths); + + let manifest = if let Some(m) = manifest { + let rdr = std::fs::OpenOptions::new().read(true).open(m.as_ref())?; + Some(Manifest::from_reader(rdr)?) + } else { + None + }; + + let collection = if matches!(siglist.as_ref().extension(), Some("zip")) { + if let Some(m) = manifest { + let storage = ZipStorage::from_file(siglist)?; + Collection::new(m, InnerStorage::new(storage)) + } else { + Collection::from_zipfile(siglist)? 
+ } + } else { + let manifest = manifest.ok_or_else(|| "Need a manifest")?; + let storage = FSStorage::builder() + .fullpath("".into()) + .subdir("".into()) + .build(); + Collection::new(manifest, InnerStorage::new(storage)) + }; + + RevIndex::create( + output.as_ref(), + collection.select(&selection)?.try_into()?, + colors, + )?; Ok(()) } From cb702b3842cd90498cc540126db99f3bbe6a97b8 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Wed, 27 Sep 2023 15:53:42 -0700 Subject: [PATCH 02/47] compiling code using newer mastiff branch --- Cargo.lock | 1 + Cargo.toml | 1 + src/check.rs | 5 +++-- src/index.rs | 14 ++++++++------ src/lib.rs | 34 +++++++++++++++++++++++++++++++++- src/mastiff_manygather.rs | 21 ++++++++++++++++++--- src/mastiff_manysearch.rs | 11 +++++++++-- src/utils.rs | 12 ++++++------ 8 files changed, 79 insertions(+), 20 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0e453e9e..e802d28d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1221,6 +1221,7 @@ dependencies = [ "anyhow", "assert_cmd", "assert_matches", + "camino", "csv", "env_logger", "log", diff --git a/Cargo.toml b/Cargo.toml index c67fc419..f96dbfd8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,6 +24,7 @@ zip = "0.6" tempfile = "3.8" needletail = "0.5.1" csv = "1.2.2" +camino = "1.1.6" [dev-dependencies] assert_cmd = "2.0.4" diff --git a/src/check.rs b/src/check.rs index 3b6484ee..7df0ca2a 100644 --- a/src/check.rs +++ b/src/check.rs @@ -2,7 +2,8 @@ use std::path::Path; use crate::utils::is_revindex_database; -use sourmash::index::revindex::RevIndex; +use sourmash::index::revindex::{RevIndex, RevIndexOps}; + pub fn check>(index: P, quick: bool) -> Result<(), Box> { if !is_revindex_database(index.as_ref()) { @@ -13,7 +14,7 @@ pub fn check>(index: P, quick: bool) -> Result<(), Box>( - siglist: P, - template: Sketch, + siglist: PathBuf, + // template: Sketch, manifest: Option

, selection: Selection, output: P, @@ -41,7 +43,7 @@ pub fn index>( None }; - let collection = if matches!(siglist.as_ref().extension(), Some("zip")) { + let collection = if matches!(&siglist.extension(), Some("zip")) { if let Some(m) = manifest { let storage = ZipStorage::from_file(siglist)?; Collection::new(m, InnerStorage::new(storage)) diff --git a/src/lib.rs b/src/lib.rs index e7de2643..b924d2a8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -16,6 +16,8 @@ mod manysketch; mod mastiff_manygather; mod mastiff_manysearch; mod multisearch; +use sourmash::selection::Selection; +use sourmash::encodings::HashFunctions; #[pyfunction] fn do_manysearch( @@ -103,10 +105,24 @@ fn do_fastmultigather( // if a siglist path is a revindex, run mastiff_manygather. If not, run multigather let template = build_template(ksize, scaled, &moltype); if is_revindex_database(siglist_path.as_ref()) { + // build selection instead of template + let hash_function = match moltype.as_str() { + "dna" => HashFunctions::Murmur64Dna, + "protein" => HashFunctions::Murmur64Protein, + "dayhoff" => HashFunctions::Murmur64Dayhoff, + "hp" => HashFunctions::Murmur64Hp, + _ => panic!("Unknown molecule type: {}", moltype), + }; + let selection = Selection::builder() + .ksize(ksize.into()) + .scaled(scaled as u32) + .moltype(hash_function) + .build(); match mastiff_manygather::mastiff_manygather( query_filenames, siglist_path, template, + selection, threshold_bp, output_path, ) { @@ -160,9 +176,25 @@ fn do_index( save_paths: bool, colors: bool, ) -> anyhow::Result { + let hash_function = match moltype.as_str() { + "dna" => HashFunctions::Murmur64Dna, + "protein" => HashFunctions::Murmur64Protein, + "dayhoff" => HashFunctions::Murmur64Dayhoff, + "hp" => HashFunctions::Murmur64Hp, + _ => panic!("Unknown molecule type: {}", moltype), + }; + let selection = Selection::builder() + .ksize(ksize.into()) + .scaled(scaled as u32) + .moltype(hash_function) + .build(); + // match index::index(siglist, template, output, 
save_paths, colors) { + // convert siglist to PathBuf // build template from ksize, scaled let template = build_template(ksize, scaled, &moltype); - match index::index(siglist, template, output, save_paths, colors) { + let location = camino::Utf8PathBuf::from(siglist); + let manifest = None; + match index::index(location, manifest, selection, output, save_paths, colors) { Ok(_) => Ok(0), Err(e) => { eprintln!("Error: {e}"); diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index 291f6b36..4ac8b5b3 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -6,7 +6,16 @@ use sourmash::signature::Signature; use sourmash::sketch::Sketch; use std::path::Path; -use sourmash::index::revindex::RevIndex; +// use sourmash::collection::Collection; +// use sourmash::selection::Selection;A +use sourmash::prelude::*; +// use sourmash::index::revindex::{prepare_query, RevIndex, RevIndexOps}; +// use sourmash::manifest::Manifest; +// use sourmash::prelude::*; +// use sourmash::signature::{Signature, SigsTrait}; +// use sourmash::storage::{FSStorage, InnerStorage, ZipStorage}; + +use sourmash::index::revindex::{RevIndex, RevIndexOps}; use std::sync::atomic; use std::sync::atomic::AtomicUsize; @@ -22,6 +31,7 @@ pub fn mastiff_manygather>( queries_file: P, index: P, template: Sketch, + selection: Selection, threshold_bp: usize, output: Option

, ) -> Result<(), Box> { @@ -32,7 +42,7 @@ pub fn mastiff_manygather>( ); } // Open database once - let db = RevIndex::open(index.as_ref(), true); + let db = RevIndex::open(index.as_ref(), true)?; println!("Loaded DB"); // Load query paths @@ -89,6 +99,10 @@ pub fn mastiff_manygather>( match Signature::from_path(filename) { Ok(query_sig) => { let location = filename.display().to_string(); + // if let Some(q) = prepare_query(&query_sig, &selection) { + // query = Some(q); + // } + // let query = query.expect("Couldn't find a compatible MinHash"); if let Some(query) = prepare_query(&query_sig, &template, &location) { // let query_size = query.minhash.size() as f64; let threshold = threshold_bp / query.minhash.scaled() as usize; @@ -105,7 +119,8 @@ pub fn mastiff_manygather>( hash_to_color, threshold, &query.minhash, - &template, + // Some(selection.clone()), + None, ); // extract matches from Result diff --git a/src/mastiff_manysearch.rs b/src/mastiff_manysearch.rs index 4681a8ef..654c1c17 100644 --- a/src/mastiff_manysearch.rs +++ b/src/mastiff_manysearch.rs @@ -6,7 +6,14 @@ use sourmash::signature::{Signature, SigsTrait}; use sourmash::sketch::Sketch; use std::path::Path; -use sourmash::index::revindex::RevIndex; +// use sourmash::collection::Collection; +// use sourmash::index::revindex::{prepare_query, RevIndex, RevIndexOps}; +// use sourmash::manifest::Manifest; +// use sourmash::prelude::*; +// use sourmash::storage::{FSStorage, InnerStorage, ZipStorage}; + +// use sourmash::index::revindex::RevIndex; +use sourmash::index::revindex::{RevIndex, RevIndexOps}; use std::sync::atomic; use std::sync::atomic::AtomicUsize; @@ -30,7 +37,7 @@ pub fn mastiff_manysearch>( ); } // Open database once - let db = RevIndex::open(index.as_ref(), true); + let db = RevIndex::open(index.as_ref(), true)?; println!("Loaded DB"); // Load query paths diff --git a/src/utils.rs b/src/utils.rs index 4cd36630..d592543c 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -19,8 +19,8 @@ use 
anyhow::{anyhow, Result}; use std::cmp::{Ordering, PartialOrd}; -use sourmash::prelude::FracMinHashOps; -use sourmash::prelude::MinHashOps; +// use sourmash::prelude::FracMinHashOps; +// use sourmash::prelude::HashOps; use sourmash::signature::{Signature, SigsTrait}; use sourmash::sketch::minhash::{max_hash_for_scaled, KmerMinHash}; use sourmash::sketch::Sketch; @@ -791,10 +791,10 @@ pub fn consume_query_by_gather + std::fmt::Debug + std::fmt::Disp pub fn build_template(ksize: u8, scaled: usize, moltype: &str) -> Sketch { let hash_function = match moltype { - "dna" => HashFunctions::murmur64_DNA, - "protein" => HashFunctions::murmur64_protein, - "dayhoff" => HashFunctions::murmur64_dayhoff, - "hp" => HashFunctions::murmur64_hp, + "dna" => HashFunctions::Murmur64Dna, + "protein" => HashFunctions::Murmur64Protein, + "dayhoff" => HashFunctions::Murmur64Dayhoff, + "hp" => HashFunctions::Murmur64Hp, _ => panic!("Unknown molecule type: {}", moltype), }; //adjust ksize if not dna From db318bab88c17b33409c3232c61a5a70cd92f8af Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Mon, 2 Oct 2023 16:07:43 -0700 Subject: [PATCH 03/47] use selection --- Cargo.toml | 1 + src/mastiff_manygather.rs | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index f96dbfd8..3e636aec 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,6 +13,7 @@ pyo3 = { version = "0.19.2", features = ["extension-module", "anyhow"] } rayon = "1.8.0" serde = { version = "1.0.136", features = ["derive"] } #sourmash = { git = "https://github.com/sourmash-bio/sourmash", "rev" = "ff1092f8f366339caa59d7203f623813228f4356" } +#sourmash = { git = "https://github.com/sourmash-bio/sourmash", branch = "lirber/mastiff", features = ["branchwater"] } sourmash = { path = "../sourmash/src/core", features = ["branchwater"] } serde_json = "1.0.107" niffler = "2.4.0" diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index 4ac8b5b3..10954d8a 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -119,8 +119,7 @@ pub fn mastiff_manygather>( hash_to_color, threshold, &query.minhash, - // Some(selection.clone()), - None, + Some(selection.clone()), ); // extract matches from Result From 07c8362284429813a41f790fd3a4c31b8a906c12 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Sun, 5 Nov 2023 14:13:02 -0800 Subject: [PATCH 04/47] rustfmt --- src/check.rs | 1 - src/index.rs | 4 ++-- src/lib.rs | 20 ++++++++++---------- src/mastiff_manygather.rs | 2 +- 4 files changed, 13 insertions(+), 14 deletions(-) diff --git a/src/check.rs b/src/check.rs index 7df0ca2a..7fea2eca 100644 --- a/src/check.rs +++ b/src/check.rs @@ -4,7 +4,6 @@ use crate::utils::is_revindex_database; use sourmash::index::revindex::{RevIndex, RevIndexOps}; - pub fn check>(index: P, quick: bool) -> Result<(), Box> { if !is_revindex_database(index.as_ref()) { bail!( diff --git a/src/index.rs b/src/index.rs index 7ed445cf..b2c38d63 100644 --- a/src/index.rs +++ b/src/index.rs @@ -7,14 +7,14 @@ use sourmash::prelude::*; // use sourmash::signature::{Signature, SigsTrait}; use sourmash::storage::{FSStorage, InnerStorage, ZipStorage}; // use sourmash::sketch::Sketch; -use std::path::Path; use camino::Utf8PathBuf as PathBuf; +use std::path::Path; use crate::utils::{load_sigpaths_from_zip_or_pathlist, ReportType}; pub fn index>( siglist: PathBuf, - // template: Sketch, + // template: Sketch, manifest: Option

, selection: Selection, output: P, diff --git a/src/lib.rs b/src/lib.rs index b924d2a8..efb47082 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -16,8 +16,8 @@ mod manysketch; mod mastiff_manygather; mod mastiff_manysearch; mod multisearch; -use sourmash::selection::Selection; use sourmash::encodings::HashFunctions; +use sourmash::selection::Selection; #[pyfunction] fn do_manysearch( @@ -114,10 +114,10 @@ fn do_fastmultigather( _ => panic!("Unknown molecule type: {}", moltype), }; let selection = Selection::builder() - .ksize(ksize.into()) - .scaled(scaled as u32) - .moltype(hash_function) - .build(); + .ksize(ksize.into()) + .scaled(scaled as u32) + .moltype(hash_function) + .build(); match mastiff_manygather::mastiff_manygather( query_filenames, siglist_path, @@ -184,12 +184,12 @@ fn do_index( _ => panic!("Unknown molecule type: {}", moltype), }; let selection = Selection::builder() - .ksize(ksize.into()) - .scaled(scaled as u32) - .moltype(hash_function) - .build(); + .ksize(ksize.into()) + .scaled(scaled as u32) + .moltype(hash_function) + .build(); // match index::index(siglist, template, output, save_paths, colors) { - // convert siglist to PathBuf + // convert siglist to PathBuf // build template from ksize, scaled let template = build_template(ksize, scaled, &moltype); let location = camino::Utf8PathBuf::from(siglist); diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index e51af754..61a0d522 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -100,7 +100,7 @@ pub fn mastiff_manygather>( Ok(query_sig) => { let location = filename.display().to_string(); // if let Some(q) = prepare_query(&query_sig, &selection) { - // query = Some(q); + // query = Some(q); // } // let query = query.expect("Couldn't find a compatible MinHash"); if let Some(query) = prepare_query(&query_sig, &template, &location) { From ff4846920d326733b920bb351dfeae004048c413 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Tue, 21 Nov 2023 17:38:28 -0800 
Subject: [PATCH 05/47] update deps --- Cargo.lock | 263 ++++------------------------------------------------- Cargo.toml | 2 +- 2 files changed, 17 insertions(+), 248 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 08ef6a0b..4391cffd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8,17 +8,6 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" -[[package]] -name = "aes" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac1f845298e95f983ff1944b728ae08b8cebab80d684f0a832ed0fc74dfa27e2" -dependencies = [ - "cfg-if", - "cipher", - "cpufeatures", -] - [[package]] name = "ahash" version = "0.7.6" @@ -105,12 +94,6 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7b7e4c2464d97fe331d41de9d5db0def0a96f4d823b8b32a2efd503578988973" -[[package]] -name = "base64ct" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" - [[package]] name = "bgzip" version = "0.2.2" @@ -172,15 +155,6 @@ dependencies = [ "wyz", ] -[[package]] -name = "block-buffer" -version = "0.10.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" -dependencies = [ - "generic-array", -] - [[package]] name = "bstr" version = "1.6.2" @@ -313,21 +287,11 @@ dependencies = [ "iana-time-zone", "js-sys", "num-traits", - "time 0.1.45", + "time", "wasm-bindgen", "windows-targets", ] -[[package]] -name = "cipher" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad" -dependencies = [ - "crypto-common", - "inout", -] - [[package]] name = "clang-sys" version = "1.6.1" @@ -348,12 +312,6 @@ dependencies = [ 
"csv", ] -[[package]] -name = "constant_time_eq" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc" - [[package]] name = "core-foundation-sys" version = "0.8.4" @@ -369,15 +327,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "cpufeatures" -version = "0.2.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a17b76ff3a4162b0b27f354a0c87015ddad39d35f9c0c36607a3bdd175dde1f1" -dependencies = [ - "libc", -] - [[package]] name = "crc32fast" version = "1.3.2" @@ -420,16 +369,6 @@ dependencies = [ "cfg-if", ] -[[package]] -name = "crypto-common" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" -dependencies = [ - "generic-array", - "typenum", -] - [[package]] name = "csv" version = "1.3.0" @@ -451,29 +390,12 @@ dependencies = [ "memchr", ] -[[package]] -name = "deranged" -version = "0.3.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2696e8a945f658fd14dc3b87242e6b80cd0f36ff04ea560fa39082368847946" - [[package]] name = "difflib" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8" -[[package]] -name = "digest" -version = "0.10.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" -dependencies = [ - "block-buffer", - "crypto-common", - "subtle", -] - [[package]] name = "doc-comment" version = "0.3.3" @@ -569,16 +491,6 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" -[[package]] -name = "generic-array" -version = "0.14.7" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" -dependencies = [ - "typenum", - "version_check", -] - [[package]] name = "getrandom" version = "0.2.10" @@ -637,15 +549,6 @@ version = "0.6.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "12cb882ccb290b8646e554b157ab0b71e64e8d5bef775cd66b6531e52d302669" -[[package]] -name = "hmac" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" -dependencies = [ - "digest", -] - [[package]] name = "humantime" version = "2.1.0" @@ -681,15 +584,6 @@ version = "2.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e186cfbae8084e513daff4240b4797e342f988cecda4fb6c939150f96315fd8" -[[package]] -name = "inout" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0c10553d664a4d0bcff9f4215d0aac67a639cc68ef660840afe309b807bc9f5" -dependencies = [ - "generic-array", -] - [[package]] name = "inplace-vec-builder" version = "0.1.1" @@ -855,9 +749,9 @@ checksum = "5486aed0026218e61b8a01d5fbd5a0a134649abb71a0e53b7bc088529dced86e" [[package]] name = "memmap2" -version = "0.7.1" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f49388d20533534cd19360ad3d6a7dadc885944aa802ba3995040c5ec11288c6" +checksum = "deaba38d7abf1d4cca21cc89e932e542ba2b9258664d2a9ef0e61512039c9375" dependencies = [ "libc", ] @@ -918,7 +812,7 @@ dependencies = [ "flate2", "thiserror", "xz2", - "zstd 0.12.4", + "zstd", ] [[package]] @@ -973,15 +867,6 @@ dependencies = [ "autocfg", ] -[[package]] -name = "numsep" -version = "0.1.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad5c49c3e12c314efb1f43cba136031b657dcd59ee26936ab2be313c5e97da22" -dependencies = [ - "slicestring", -] - [[package]] 
name = "once_cell" version = "1.18.0" @@ -1036,29 +921,6 @@ dependencies = [ "windows-targets", ] -[[package]] -name = "password-hash" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7676374caaee8a325c9e7a2ae557f216c5563a171d6997b0ef8a65af35147700" -dependencies = [ - "base64ct", - "rand_core", - "subtle", -] - -[[package]] -name = "pbkdf2" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83a0692ec44e4cf1ef28ca317f14f8f07da2d95ec3fa01f86e4467b725e60917" -dependencies = [ - "digest", - "hmac", - "password-hash", - "sha2", -] - [[package]] name = "peeking_take_while" version = "0.1.2" @@ -1532,28 +1394,6 @@ dependencies = [ "serde", ] -[[package]] -name = "sha1" -version = "0.10.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f04293dc80c3993519f2d7f6f511707ee7094fe0c6d3406feb330cdb3540eba3" -dependencies = [ - "cfg-if", - "cpufeatures", - "digest", -] - -[[package]] -name = "sha2" -version = "0.10.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "479fb9d862239e610720565ca91403019f2f00410f1864c5aa7479b950a76ed8" -dependencies = [ - "cfg-if", - "cpufeatures", - "digest", -] - [[package]] name = "shlex" version = "1.1.0" @@ -1572,18 +1412,6 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8542b68b8800c3cda649d2c72d688b6907b30f1580043135d61669d4aad1c175" -[[package]] -name = "size" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fed904c7fb2856d868b92464fc8fa597fce366edea1a9cbfaa8cb5fe080bd6d" - -[[package]] -name = "slicestring" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "636b979c5672ac7c2a1120ca0a9a6074cd090dadfec42af6f8a5baea1223d180" - [[package]] name = "smallvec" version = "1.11.0" @@ -1599,9 +1427,9 @@ checksum = 
"bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.12.0" +source = "git+https://github.com/sourmash-bio/sourmash?branch=lirber/mastiff#cfe28341af5847235bbf853424ee4917995665ee" dependencies = [ "az", - "bytecount", "byteorder", "camino", "cfg-if", @@ -1620,7 +1448,6 @@ dependencies = [ "niffler", "nohash-hasher", "num-iter", - "numsep", "once_cell", "ouroboros", "piz", @@ -1631,7 +1458,6 @@ dependencies = [ "rocksdb", "serde", "serde_json", - "size", "thiserror", "twox-hash", "typed-builder", @@ -1646,12 +1472,6 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" -[[package]] -name = "subtle" -version = "2.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" - [[package]] name = "syn" version = "1.0.109" @@ -1745,23 +1565,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "time" -version = "0.3.28" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17f6bb557fd245c28e6411aa56b6403c689ad95061f50e4be16c274e70a17e48" -dependencies = [ - "deranged", - "serde", - "time-core", -] - -[[package]] -name = "time-core" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb" - [[package]] name = "tinyvec" version = "1.6.0" @@ -1799,12 +1602,6 @@ dependencies = [ "syn 1.0.109", ] -[[package]] -name = "typenum" -version = "1.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" - [[package]] name = "unicode-ident" version = "1.0.11" @@ -1873,9 +1670,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.87" 
+version = "0.2.88" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342" +checksum = "7daec296f25a1bae309c0cd5c29c4b260e510e6d813c286b19eaadf409d40fce" dependencies = [ "cfg-if", "serde", @@ -1885,9 +1682,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.87" +version = "0.2.88" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd" +checksum = "e397f4664c0e4e428e8313a469aaa58310d302159845980fd23b0f22a847f217" dependencies = [ "bumpalo", "log", @@ -1900,9 +1697,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.87" +version = "0.2.88" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d" +checksum = "5961017b3b08ad5f3fe39f1e79877f8ee7c23c5e5fd5eb80de95abc41f1f16b2" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -1910,9 +1707,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.87" +version = "0.2.88" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" +checksum = "c5353b8dab669f5e10f5bd76df26a9360c748f054f862ff5f3f8aae0c7fb3907" dependencies = [ "proc-macro2", "quote", @@ -1923,9 +1720,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.87" +version = "0.2.88" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1" +checksum = "0d046c5d029ba91a1ed14da14dca44b68bf2f124cfbaf741c54151fdb3e0750b" [[package]] name = "web-sys" @@ -2067,27 +1864,9 @@ version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"760394e246e4c28189f19d488c058bf16f564016aefac5d32bb1f3b51d5e9261" dependencies = [ - "aes", "byteorder", - "bzip2", - "constant_time_eq", "crc32fast", "crossbeam-utils", - "flate2", - "hmac", - "pbkdf2", - "sha1", - "time 0.3.28", - "zstd 0.11.2+zstd.1.5.2", -] - -[[package]] -name = "zstd" -version = "0.11.2+zstd.1.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20cc960326ece64f010d2d2107537f26dc589a6573a316bd5b1dba685fa5fde4" -dependencies = [ - "zstd-safe 5.0.2+zstd.1.5.2", ] [[package]] @@ -2096,17 +1875,7 @@ version = "0.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c" dependencies = [ - "zstd-safe 6.0.6", -] - -[[package]] -name = "zstd-safe" -version = "5.0.2+zstd.1.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d2a5585e04f9eea4b2a3d1eca508c4dee9592a89ef6f450c11719da0726f4db" -dependencies = [ - "libc", - "zstd-sys", + "zstd-safe", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 45cc21c5..0ce8b0e7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,7 @@ log = "0.4.14" env_logger = "0.10.1" simple-error = "0.3.0" anyhow = "1.0.75" -zip = "0.6" +zip = { version = "0.6", default-features = false } tempfile = "3.8" needletail = "0.5.1" csv = "1.3.0" From ddf6c8c3baef2ab19d7e44f3639817c6650c8bc2 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Thu, 30 Nov 2023 20:56:37 -0800 Subject: [PATCH 06/47] update to sourmash 0.12.0 --- Cargo.lock | 31 ++++++++++++++++--------------- Cargo.toml | 3 ++- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b5561ba8..bd83ed3b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -630,9 +630,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.64" +version = "0.3.66" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a" +checksum = "cee9c64da59eae3b50095c18d3e74f8b73c0b86d2792824ff01bbce68ba229ca" dependencies = [ "wasm-bindgen", ] @@ -1427,7 +1427,8 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.12.0" -source = "git+https://github.com/sourmash-bio/sourmash?branch=lirber/mastiff#cfe28341af5847235bbf853424ee4917995665ee" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "760c7b049cc70294122c44c4e6d0922ed0e79a8e04f2d739b98a982027a9fd4a" dependencies = [ "az", "byteorder", @@ -1670,9 +1671,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.88" +version = "0.2.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7daec296f25a1bae309c0cd5c29c4b260e510e6d813c286b19eaadf409d40fce" +checksum = "0ed0d4f68a3015cc185aff4db9506a015f4b96f95303897bfa23f846db54064e" dependencies = [ "cfg-if", "serde", @@ -1682,9 +1683,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.88" +version = "0.2.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e397f4664c0e4e428e8313a469aaa58310d302159845980fd23b0f22a847f217" +checksum = "1b56f625e64f3a1084ded111c4d5f477df9f8c92df113852fa5a374dbda78826" dependencies = [ "bumpalo", "log", @@ -1697,9 +1698,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.88" +version = "0.2.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5961017b3b08ad5f3fe39f1e79877f8ee7c23c5e5fd5eb80de95abc41f1f16b2" +checksum = "0162dbf37223cd2afce98f3d0785506dcb8d266223983e4b5b525859e6e182b2" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -1707,9 +1708,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.88" +version = "0.2.89" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5353b8dab669f5e10f5bd76df26a9360c748f054f862ff5f3f8aae0c7fb3907" +checksum = "f0eb82fcb7930ae6219a7ecfd55b217f5f0893484b7a13022ebb2b2bf20b5283" dependencies = [ "proc-macro2", "quote", @@ -1720,15 +1721,15 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.88" +version = "0.2.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d046c5d029ba91a1ed14da14dca44b68bf2f124cfbaf741c54151fdb3e0750b" +checksum = "7ab9b36309365056cd639da3134bf87fa8f3d86008abf99e612384a6eecd459f" [[package]] name = "web-sys" -version = "0.3.64" +version = "0.3.66" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b85cbef8c220a6abc02aefd892dfc0fc23afb1c6a426316ec33253a3877249b" +checksum = "50c24a44ec86bb68fbecd1b3efed7e85ea5621b39b35ef2766b66cd984f8010f" dependencies = [ "js-sys", "wasm-bindgen", diff --git a/Cargo.toml b/Cargo.toml index 67d2cd2b..1d1d3f47 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,7 +14,8 @@ rayon = "1.8.0" serde = { version = "1.0.192", features = ["derive"] } #sourmash = { git = "https://github.com/sourmash-bio/sourmash", "rev" = "ff1092f8f366339caa59d7203f623813228f4356" } -sourmash = { git = "https://github.com/sourmash-bio/sourmash", branch = "lirber/mastiff", features = ["branchwater"] } +#sourmash = { git = "https://github.com/sourmash-bio/sourmash", branch = "lirber/mastiff", features = ["branchwater"] } +sourmash = { version = "0.12.0", features = ["branchwater"] } serde_json = "1.0.108" niffler = "2.4.0" log = "0.4.14" From b48ac885ef1e9faf6b6dd44e376997cc5c62114f Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Fri, 1 Dec 2023 15:22:29 -0800 Subject: [PATCH 07/47] fix index --- Cargo.lock | 1 + Cargo.toml | 2 +- src/index.rs | 11 +++++++++-- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bd83ed3b..9b850190 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1868,6 
+1868,7 @@ dependencies = [ "byteorder", "crc32fast", "crossbeam-utils", + "flate2", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 1d1d3f47..c5952f98 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,7 +22,7 @@ log = "0.4.14" env_logger = "0.10.1" simple-error = "0.3.0" anyhow = "1.0.75" -zip = { version = "0.6", default-features = false } +zip = { version = "0.6", default-features = false, features = ["deflate"] } tempfile = "3.8" needletail = "0.5.1" csv = "1.3.0" diff --git a/src/index.rs b/src/index.rs index b2c38d63..6768d058 100644 --- a/src/index.rs +++ b/src/index.rs @@ -10,7 +10,7 @@ use sourmash::storage::{FSStorage, InnerStorage, ZipStorage}; use camino::Utf8PathBuf as PathBuf; use std::path::Path; -use crate::utils::{load_sigpaths_from_zip_or_pathlist, ReportType}; +use crate::utils::load_sketchlist_filenames; pub fn index>( siglist: PathBuf, @@ -51,7 +51,14 @@ pub fn index>( Collection::from_zipfile(siglist)? } } else { - let manifest = manifest.ok_or_else(|| "Need a manifest")?; + let manifest = manifest.unwrap_or_else(|| { + let sig_paths: Vec<_> = load_sketchlist_filenames(&siglist) + .unwrap_or_else(|_| panic!("Error loading siglist")) + .into_iter() + .map(|v| PathBuf::from_path_buf(v).unwrap()) + .collect(); + sig_paths.as_slice().into() + }); let storage = FSStorage::builder() .fullpath("".into()) .subdir("".into()) From f1145a16b4f799113eb134497180003482f4c9ac Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Fri, 1 Dec 2023 16:26:13 -0800 Subject: [PATCH 08/47] rm reporting line checks not in smash core idx --- src/python/tests/test_index.py | 38 ++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/src/python/tests/test_index.py b/src/python/tests/test_index.py index 2968b356..eeb8f76a 100644 --- a/src/python/tests/test_index.py +++ b/src/python/tests/test_index.py @@ -89,7 +89,7 @@ def test_index_missing_siglist(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory ' in captured.err + assert 'Error loading siglist' in captured.err def test_index_bad_siglist(runtmp, capfd): @@ -103,7 +103,7 @@ def test_index_bad_siglist(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert "Error: invalid line in fromfile" in captured.err + assert 'Error loading siglist' in captured.err print(runtmp.last_result.err) @@ -128,21 +128,29 @@ def test_index_bad_siglist_2(runtmp, capfd): def test_index_empty_siglist(runtmp, capfd): + ## TODO: index:: do not write output if no signatures to write? + # OR, warn user? + # test empty siglist file siglist = runtmp.output('db-sigs.txt') output = runtmp.output('out.db') make_file_list(siglist, []) # empty - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'index', siglist, + # with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'index', siglist, '-o', output) captured = capfd.readouterr() + assert os.path.exists(output) # do we want an empty file, or no file? + print(runtmp.last_result.out) + print(runtmp.last_result.err) print(captured.err) - assert "No signatures to index loaded, exiting." in captured.err + # assert "No signatures to index loaded, exiting." in captured.err def test_index_nomatch_sig_in_siglist(runtmp, capfd): + ## TODO: index:: do not write output if no signatures to write? 
+ # test index with a siglist file that has (only) a non-matching ksize sig siglist = runtmp.output('against.txt') db = runtmp.output('db.rdb') @@ -151,13 +159,16 @@ def test_index_nomatch_sig_in_siglist(runtmp, capfd): sig1 = get_test_data('1.fa.k21.sig.gz') make_file_list(siglist, [sig2, sig1]) - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'index', siglist, + # with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'index', siglist, '-o', db) captured = capfd.readouterr() + assert os.path.exists(db) # do we want an empty file, or no file? + print(runtmp.last_result.out) + print(runtmp.last_result.err) print(captured.err) - assert "Couldn't find a compatible MinHash" in captured.err + # assert "Couldn't find a compatible MinHash" in captured.err def test_index_zipfile(runtmp, capfd): @@ -184,7 +195,7 @@ def test_index_zipfile(runtmp, capfd): assert 'index is done' in runtmp.last_result.err captured = capfd.readouterr() print(captured.err) - assert 'Found 3 filepaths' in captured.err + # assert 'Found 3 filepaths' in captured.err def test_index_zipfile_repeated_md5sums(runtmp, capfd): @@ -212,7 +223,7 @@ def test_index_zipfile_repeated_md5sums(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert 'Found 3 filepaths' in captured.err + # assert 'Found 3 filepaths' in captured.err assert 'index is done' in runtmp.last_result.err @@ -243,8 +254,8 @@ def test_index_zipfile_multiparam(runtmp, capfd): assert 'index is done' in runtmp.last_result.err captured = capfd.readouterr() print(captured.err) - assert 'WARNING: skipped 5 index paths - no compatible signatures.' in captured.err - assert 'Found 4 filepaths' in captured.err + # assert 'WARNING: skipped 5 index paths - no compatible signatures.' 
in captured.err + # assert 'Found 4 filepaths' in captured.err def test_index_zipfile_bad(runtmp, capfd): @@ -266,7 +277,8 @@ def test_index_zipfile_bad(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err + assert "Couldn't find End Of Central Directory Record" in captured.err + # assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err def test_index_check(runtmp): From a6785dcc08d4bc7ddb8a46194edcfe7177311d4f Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 7 Dec 2023 13:21:04 -0800 Subject: [PATCH 09/47] use selection instead of template --- src/mastiff_manygather.rs | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index 61a0d522..2fcec3cc 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -7,7 +7,7 @@ use sourmash::sketch::Sketch; use std::path::Path; // use sourmash::collection::Collection; -// use sourmash::selection::Selection;A +// use sourmash::selection::Selection; use sourmash::prelude::*; // use sourmash::index::revindex::{prepare_query, RevIndex, RevIndexOps}; // use sourmash::manifest::Manifest; @@ -15,7 +15,7 @@ use sourmash::prelude::*; // use sourmash::signature::{Signature, SigsTrait}; // use sourmash::storage::{FSStorage, InnerStorage, ZipStorage}; -use sourmash::index::revindex::{RevIndex, RevIndexOps}; +use sourmash::index::revindex::{prepare_query, RevIndex, RevIndexOps}; use std::sync::atomic; use std::sync::atomic::AtomicUsize; @@ -24,8 +24,8 @@ use std::fs::File; use std::io::{BufWriter, Write}; use crate::utils::{ - is_revindex_database, load_sigpaths_from_zip_or_pathlist, prepare_query, ReportType, -}; + is_revindex_database, load_sigpaths_from_zip_or_pathlist, ReportType, +}; // prepare_query pub fn mastiff_manygather>( queries_file: P, @@ -96,27 +96,33 @@ 
pub fn mastiff_manygather>( let mut results = vec![]; // load query signature from path: - match Signature::from_path(filename) { + // todo: add reason text to expect instead of using match arms? + // note: can't keep track of failed paths if we do that? + match Signature::from_path(filename).expect("REASON").swap_remove(0).select(&selection) { Ok(query_sig) => { - let location = filename.display().to_string(); - // if let Some(q) = prepare_query(&query_sig, &selection) { - // query = Some(q); + eprintln!("query_sig selection scaled: {}", selection.scaled()?.to_string()); + let mut query = None; + // if let Some(q) = prepare_query(query_sig, &selection) { + // query = Some(q); // } // let query = query.expect("Couldn't find a compatible MinHash"); - if let Some(query) = prepare_query(&query_sig, &template, &location) { + if let Some(q) = prepare_query(query_sig.clone(), &selection) { + query = Some(q); + let query = query.expect("Couldn't find a compatible MinHash"); + //if let Some(query) = prepare_query(&query_sig, &template, &location) { // let query_size = query.minhash.size() as f64; - let threshold = threshold_bp / query.minhash.scaled() as usize; + let threshold = threshold_bp / query.scaled() as usize; // mastiff gather code let (counter, query_colors, hash_to_color) = - db.prepare_gather_counters(&query.minhash); + db.prepare_gather_counters(&query); let matches = db.gather( counter, query_colors, hash_to_color, threshold, - &query.minhash, + &query, Some(selection.clone()), ); @@ -124,8 +130,8 @@ pub fn mastiff_manygather>( if let Ok(matches) = matches { for match_ in &matches { results.push(( - query.name.clone(), - query.md5sum.clone(), + query_sig.name().clone(), + query.md5sum().clone(), match_.name().clone(), match_.md5().clone(), match_.f_match(), // f_match_query From 21f20caeb75e07a8e99591708a909809f3b5867a Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Mon, 22 Jan 2024 17:46:33 -0800 Subject: [PATCH 10/47] rustfmt --- src/mastiff_manygather.rs | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index 2fcec3cc..a4293066 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -23,9 +23,7 @@ use std::sync::atomic::AtomicUsize; use std::fs::File; use std::io::{BufWriter, Write}; -use crate::utils::{ - is_revindex_database, load_sigpaths_from_zip_or_pathlist, ReportType, -}; // prepare_query +use crate::utils::{is_revindex_database, load_sigpaths_from_zip_or_pathlist, ReportType}; // prepare_query pub fn mastiff_manygather>( queries_file: P, @@ -98,9 +96,16 @@ pub fn mastiff_manygather>( // load query signature from path: // todo: add reason text to expect instead of using match arms? // note: can't keep track of failed paths if we do that? - match Signature::from_path(filename).expect("REASON").swap_remove(0).select(&selection) { + match Signature::from_path(filename) + .expect("REASON") + .swap_remove(0) + .select(&selection) + { Ok(query_sig) => { - eprintln!("query_sig selection scaled: {}", selection.scaled()?.to_string()); + eprintln!( + "query_sig selection scaled: {}", + selection.scaled()?.to_string() + ); let mut query = None; // if let Some(q) = prepare_query(query_sig, &selection) { // query = Some(q); @@ -109,7 +114,7 @@ pub fn mastiff_manygather>( if let Some(q) = prepare_query(query_sig.clone(), &selection) { query = Some(q); let query = query.expect("Couldn't find a compatible MinHash"); - //if let Some(query) = prepare_query(&query_sig, &template, &location) { + //if let Some(query) = prepare_query(&query_sig, &template, &location) { // let query_size = query.minhash.size() as f64; let threshold = threshold_bp / query.scaled() as usize; From 45b598fbf363c2c9baa98f5a50ba9bb63068c30d Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Mon, 22 Jan 2024 18:36:38 -0800 Subject: [PATCH 11/47] fix query file no exist errs --- Cargo.toml | 4 +- src/mastiff_manygather.rs | 122 ++++++++++++++------------- src/python/tests/test_multigather.py | 3 +- 3 files changed, 66 insertions(+), 63 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index c0d47a42..dde6aa5d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,9 +12,9 @@ crate-type = ["cdylib"] pyo3 = { version = "0.20.2", features = ["extension-module", "anyhow"] } rayon = "1.8.1" serde = { version = "1.0.195", features = ["derive"] } -#sourmash = { git = "https://github.com/sourmash-bio/sourmash", "rev" = "b780164a2cc6db1cf66e58ca5ea55b83b563921e" } +sourmash = { git = "https://github.com/sourmash-bio/sourmash", branch = "select-downsample", features = ["branchwater"] } #sourmash = { git = "https://github.com/sourmash-bio/sourmash", branch = "lirber/mastiff", features = ["branchwater"] } -sourmash = { version = "0.12.0", features = ["branchwater"] } +#sourmash = { version = "0.12.0", features = ["branchwater"] } serde_json = "1.0.111" niffler = "2.4.0" log = "0.4.14" diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index a4293066..6ef89adc 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -2,6 +2,7 @@ use anyhow::Result; use rayon::prelude::*; +use sourmash::ffi::signature; use sourmash::signature::Signature; use sourmash::sketch::Sketch; use std::path::Path; @@ -94,71 +95,74 @@ pub fn mastiff_manygather>( let mut results = vec![]; // load query signature from path: - // todo: add reason text to expect instead of using match arms? - // note: can't keep track of failed paths if we do that? 
- match Signature::from_path(filename) - .expect("REASON") - .swap_remove(0) - .select(&selection) - { - Ok(query_sig) => { - eprintln!( - "query_sig selection scaled: {}", - selection.scaled()?.to_string() - ); - let mut query = None; - // if let Some(q) = prepare_query(query_sig, &selection) { - // query = Some(q); - // } - // let query = query.expect("Couldn't find a compatible MinHash"); - if let Some(q) = prepare_query(query_sig.clone(), &selection) { - query = Some(q); - let query = query.expect("Couldn't find a compatible MinHash"); - //if let Some(query) = prepare_query(&query_sig, &template, &location) { - // let query_size = query.minhash.size() as f64; - let threshold = threshold_bp / query.scaled() as usize; - - // mastiff gather code - let (counter, query_colors, hash_to_color) = - db.prepare_gather_counters(&query); - - let matches = db.gather( - counter, - query_colors, - hash_to_color, - threshold, - &query, - Some(selection.clone()), - ); - - // extract matches from Result - if let Ok(matches) = matches { - for match_ in &matches { - results.push(( - query_sig.name().clone(), - query.md5sum().clone(), - match_.name().clone(), - match_.md5().clone(), - match_.f_match(), // f_match_query - match_.intersect_bp(), - )); // intersect_bp + match Signature::from_path(filename) { + Ok(mut signature) => { + match signature.swap_remove(0).select(&selection) { + Ok(query_sig) => { + eprintln!( + "query_sig selection scaled: {}", + selection.scaled()?.to_string() + ); + let mut query = None; + + if let Some(q) = prepare_query(query_sig.clone(), &selection) { + query = Some(q); + let query = query.expect("Couldn't find a compatible MinHash"); + + let threshold = threshold_bp / query.scaled() as usize; + + // mastiff gather code + let (counter, query_colors, hash_to_color) = + db.prepare_gather_counters(&query); + + let matches = db.gather( + counter, + query_colors, + hash_to_color, + threshold, + &query, + Some(selection.clone()), + ); + + // extract matches 
from Result + if let Ok(matches) = matches { + for match_ in &matches { + results.push(( + query_sig.name().clone(), + query.md5sum().clone(), + match_.name().clone(), + match_.md5().clone(), + match_.f_match(), // f_match_query + match_.intersect_bp(), + )); // intersect_bp + } + } else { + eprintln!("Error gathering matches: {:?}", matches.err()); + } + } else { + if !queryfile_name.ends_with(".zip") { + eprintln!( + "WARNING: no compatible sketches in path '{}'", + filename.display() + ); + } + let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); + } + if results.is_empty() { + None + } else { + Some(results) } - } else { - eprintln!("Error gathering matches: {:?}", matches.err()); } - } else { - if !queryfile_name.ends_with(".zip") { + Err(err) => { + let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); + eprintln!("Error in processing: {}", err); eprintln!( - "WARNING: no compatible sketches in path '{}'", + "WARNING: could not process item from path '{}'", filename.display() ); + None } - let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); - } - if results.is_empty() { - None - } else { - Some(results) } } Err(err) => { diff --git a/src/python/tests/test_multigather.py b/src/python/tests/test_multigather.py index a00d2b62..646e9309 100644 --- a/src/python/tests/test_multigather.py +++ b/src/python/tests/test_multigather.py @@ -244,7 +244,7 @@ def test_bad_query_2(runtmp, capfd, indexed): @pytest.mark.parametrize('indexed', [False, True]) def test_missing_query(runtmp, capfd, indexed): - # test missingfile in querylist + # test missing query query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -263,7 +263,6 @@ def test_missing_query(runtmp, capfd, indexed): captured = capfd.readouterr() print(captured.err) - assert "WARNING: could not load sketches from path 'no-exist'" in captured.err assert "WARNING: 1 query paths failed to load. See error messages above." 
From 34ed1bbe14a941cc02c6514b629d47318a8a44d1 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 23 Jan 2024 15:14:07 -0800 Subject: [PATCH 12/47] update mastiff_manygather --- Cargo.toml | 4 +- src/mastiff_manygather.rs | 115 +++++++++++++++++--------------------- 2 files changed, 54 insertions(+), 65 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index dde6aa5d..490e4c56 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,8 +12,8 @@ crate-type = ["cdylib"] pyo3 = { version = "0.20.2", features = ["extension-module", "anyhow"] } rayon = "1.8.1" serde = { version = "1.0.195", features = ["derive"] } -sourmash = { git = "https://github.com/sourmash-bio/sourmash", branch = "select-downsample", features = ["branchwater"] } -#sourmash = { git = "https://github.com/sourmash-bio/sourmash", branch = "lirber/mastiff", features = ["branchwater"] } +sourmash = { git = "https://github.com/sourmash-bio/sourmash", branch="select-downsample", features = ["branchwater"] } +#sourmash = { git = "https://github.com/sourmash-bio/sourmash", rev= "94b88cc314f781342721addc5ed35c531732a9b6", features = ["branchwater"] } #sourmash = { version = "0.12.0", features = ["branchwater"] } serde_json = "1.0.111" niffler = "2.4.0" diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index 6ef89adc..1e50add6 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -5,6 +5,7 @@ use rayon::prelude::*; use sourmash::ffi::signature; use sourmash::signature::Signature; use sourmash::sketch::Sketch; +use sourmash::sketch::minhash::KmerMinHash; use std::path::Path; // use sourmash::collection::Collection; @@ -84,36 +85,25 @@ pub fn mastiff_manygather>( let skipped_paths = AtomicUsize::new(0); let failed_paths = AtomicUsize::new(0); - let send = query_paths - .par_iter() - .filter_map(|filename| { - let i = processed_sigs.fetch_add(1, atomic::Ordering::SeqCst); - if i % 1000 == 0 { - eprintln!("Processed {} search sigs", i); - } - - let mut results = vec![]; - - 
// load query signature from path: - match Signature::from_path(filename) { - Ok(mut signature) => { - match signature.swap_remove(0).select(&selection) { - Ok(query_sig) => { - eprintln!( - "query_sig selection scaled: {}", - selection.scaled()?.to_string() - ); - let mut query = None; - - if let Some(q) = prepare_query(query_sig.clone(), &selection) { - query = Some(q); - let query = query.expect("Couldn't find a compatible MinHash"); - let threshold = threshold_bp / query.scaled() as usize; - - // mastiff gather code - let (counter, query_colors, hash_to_color) = - db.prepare_gather_counters(&query); + let send = query_paths + .par_iter() + .filter_map(|filename| { + // ... existing setup code ... + let threshold = threshold_bp / selection.scaled()? as usize; + + match Signature::from_path(filename) { + Ok(mut signatures) if !signatures.is_empty() => { + match signatures.swap_remove(0).select(&selection) { + Ok(query_sig) => { + let mut results = vec![]; + let mut found_compatible_sketch = false; + for sketch in query_sig.iter() { + if let Sketch::MinHash(query) = sketch { + found_compatible_sketch = true; + // eprintln!("query-size: {}", sketch.size()); + // Gather! 
+ let (counter, query_colors, hash_to_color) = db.prepare_gather_counters(&query); let matches = db.gather( counter, @@ -123,8 +113,7 @@ pub fn mastiff_manygather>( &query, Some(selection.clone()), ); - - // extract matches from Result + // extract results if let Ok(matches) = matches { for match_ in &matches { results.push(( @@ -139,45 +128,45 @@ pub fn mastiff_manygather>( } else { eprintln!("Error gathering matches: {:?}", matches.err()); } - } else { - if !queryfile_name.ends_with(".zip") { - eprintln!( - "WARNING: no compatible sketches in path '{}'", - filename.display() - ); - } - let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } - if results.is_empty() { - None - } else { - Some(results) + } + if !found_compatible_sketch { + if !queryfile_name.ends_with(".zip") { + eprintln!( + "WARNING: no compatible sketches in path '{}'", + filename.display() + ); } + let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } - Err(err) => { - let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); - eprintln!("Error in processing: {}", err); - eprintln!( - "WARNING: could not process item from path '{}'", - filename.display() - ); + + if results.is_empty() { None + } else { + Some(results) } } + Err(err) => { + eprintln!("Error selecting sketches: {}", err); + let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); + None + } } - Err(err) => { - let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); - eprintln!("Sketch loading error: {}", err); - eprintln!( - "WARNING: could not load sketches from path '{}'", - filename.display() - ); - None - } } - }) - .flatten() - .try_for_each_with(send, |s, m| s.send(m)); + Ok(_) => { + eprintln!("No signatures found in '{}'", filename.display()); + let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); + None + } + Err(err) => { + eprintln!("WARNING: could not load sketches from path '{}': {}", filename.display(), err); + let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); + None + 
} + } + }) + .flatten() + .try_for_each_with(send, |s, m| s.send(m)); // do some cleanup and error handling - if let Err(e) = send { @@ -203,7 +192,7 @@ pub fn mastiff_manygather>( } if failed_paths > 0 { eprintln!( - "WARNING: {} signature paths failed to load. See error messages above.", + "WARNING: {} query paths failed to load. See error messages above.", failed_paths ); } From 243d1067178c37763029263cd16084ae9e569199 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 23 Jan 2024 15:16:10 -0800 Subject: [PATCH 13/47] rustfmt --- src/mastiff_manygather.rs | 148 +++++++++++++++++++------------------- 1 file changed, 76 insertions(+), 72 deletions(-) diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index 1e50add6..b581d73e 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -4,8 +4,8 @@ use rayon::prelude::*; use sourmash::ffi::signature; use sourmash::signature::Signature; -use sourmash::sketch::Sketch; use sourmash::sketch::minhash::KmerMinHash; +use sourmash::sketch::Sketch; use std::path::Path; // use sourmash::collection::Collection; @@ -85,88 +85,92 @@ pub fn mastiff_manygather>( let skipped_paths = AtomicUsize::new(0); let failed_paths = AtomicUsize::new(0); - let send = query_paths - .par_iter() - .filter_map(|filename| { - // ... existing setup code ... - let threshold = threshold_bp / selection.scaled()? as usize; - - match Signature::from_path(filename) { - Ok(mut signatures) if !signatures.is_empty() => { - match signatures.swap_remove(0).select(&selection) { - Ok(query_sig) => { - let mut results = vec![]; - let mut found_compatible_sketch = false; - for sketch in query_sig.iter() { - if let Sketch::MinHash(query) = sketch { - found_compatible_sketch = true; - // eprintln!("query-size: {}", sketch.size()); - // Gather! 
- let (counter, query_colors, hash_to_color) = db.prepare_gather_counters(&query); - - let matches = db.gather( - counter, - query_colors, - hash_to_color, - threshold, - &query, - Some(selection.clone()), - ); - // extract results - if let Ok(matches) = matches { - for match_ in &matches { - results.push(( - query_sig.name().clone(), - query.md5sum().clone(), - match_.name().clone(), - match_.md5().clone(), - match_.f_match(), // f_match_query - match_.intersect_bp(), - )); // intersect_bp + .par_iter() + .filter_map(|filename| { + // ... existing setup code ... + let threshold = threshold_bp / selection.scaled()? as usize; + + match Signature::from_path(filename) { + Ok(mut signatures) if !signatures.is_empty() => { + match signatures.swap_remove(0).select(&selection) { + Ok(query_sig) => { + let mut results = vec![]; + let mut found_compatible_sketch = false; + for sketch in query_sig.iter() { + if let Sketch::MinHash(query) = sketch { + found_compatible_sketch = true; + // eprintln!("query-size: {}", sketch.size()); + // Gather! 
+ let (counter, query_colors, hash_to_color) = + db.prepare_gather_counters(&query); + + let matches = db.gather( + counter, + query_colors, + hash_to_color, + threshold, + &query, + Some(selection.clone()), + ); + // extract results + if let Ok(matches) = matches { + for match_ in &matches { + results.push(( + query_sig.name().clone(), + query.md5sum().clone(), + match_.name().clone(), + match_.md5().clone(), + match_.f_match(), // f_match_query + match_.intersect_bp(), + )); // intersect_bp + } + } else { + eprintln!("Error gathering matches: {:?}", matches.err()); } - } else { - eprintln!("Error gathering matches: {:?}", matches.err()); } } - } - if !found_compatible_sketch { - if !queryfile_name.ends_with(".zip") { - eprintln!( - "WARNING: no compatible sketches in path '{}'", - filename.display() - ); + if !found_compatible_sketch { + if !queryfile_name.ends_with(".zip") { + eprintln!( + "WARNING: no compatible sketches in path '{}'", + filename.display() + ); + } + let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } - let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); - } - if results.is_empty() { + if results.is_empty() { + None + } else { + Some(results) + } + } + Err(err) => { + eprintln!("Error selecting sketches: {}", err); + let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); None - } else { - Some(results) } } - Err(err) => { - eprintln!("Error selecting sketches: {}", err); - let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); - None - } + } + Ok(_) => { + eprintln!("No signatures found in '{}'", filename.display()); + let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); + None + } + Err(err) => { + eprintln!( + "WARNING: could not load sketches from path '{}': {}", + filename.display(), + err + ); + let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); + None } } - Ok(_) => { - eprintln!("No signatures found in '{}'", filename.display()); - let _ = failed_paths.fetch_add(1, 
atomic::Ordering::SeqCst); - None - } - Err(err) => { - eprintln!("WARNING: could not load sketches from path '{}': {}", filename.display(), err); - let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); - None - } - } - }) - .flatten() - .try_for_each_with(send, |s, m| s.send(m)); + }) + .flatten() + .try_for_each_with(send, |s, m| s.send(m)); // do some cleanup and error handling - if let Err(e) = send { From 87219aa60e06dc8f7f4919eabd10f91c198de17f Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 23 Jan 2024 15:18:20 -0800 Subject: [PATCH 14/47] add cargo lock --- Cargo.lock | 248 ++++++++++++++++++++++++----------------------------- 1 file changed, 113 insertions(+), 135 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c457c851..e593bb3c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,12 +2,6 @@ # It is not intended for manual editing. version = 3 -[[package]] -name = "Inflector" -version = "0.11.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe438c63458706e03479442743baae6c88256498e6431708f6dfc520a26515d3" - [[package]] name = "adler" version = "1.0.2" @@ -110,6 +104,12 @@ dependencies = [ "thiserror", ] +[[package]] +name = "binary-merge" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "597bb81c80a54b6a4381b23faba8d7774b144c94cbd1d6fe3f1329bd776554ab" + [[package]] name = "bindgen" version = "0.65.1" @@ -222,6 +222,12 @@ version = "1.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" +[[package]] +name = "bytes" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" + [[package]] name = "bzip2" version = "0.4.4" @@ -248,6 +254,9 @@ name = "camino" version = "1.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"c59e92b5a388f549b863a7bea62612c09f24c8393560709a54558a9abdfb3b9c" +dependencies = [ + "serde", +] [[package]] name = "cc" @@ -405,6 +414,18 @@ version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" +[[package]] +name = "enum_dispatch" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f33313078bb8d4d05a2733a94ac4c2d8a0df9a2b84424ebf4f33bfc224a890e" +dependencies = [ + "once_cell", + "proc-macro2", + "quote", + "syn 2.0.48", +] + [[package]] name = "env_logger" version = "0.10.2" @@ -459,37 +480,12 @@ dependencies = [ "num-traits", ] -[[package]] -name = "flume" -version = "0.10.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1657b4441c3403d9f7b3409e47575237dac27b1b5726df654a6ecbf92f0f7577" -dependencies = [ - "futures-core", - "futures-sink", - "nanorand", - "pin-project", - "spin", -] - [[package]] name = "funty" version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" -[[package]] -name = "futures-core" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" - -[[package]] -name = "futures-sink" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" - [[package]] name = "getrandom" version = "0.2.10" @@ -544,9 +540,12 @@ checksum = "443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b" [[package]] name = "histogram" -version = "0.6.9" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12cb882ccb290b8646e554b157ab0b71e64e8d5bef775cd66b6531e52d302669" +checksum = 
"de0f59c8ab5f8d1f1dd481174172ce418e2e306d665cdd8057c0bd457c447159" +dependencies = [ + "thiserror", +] [[package]] name = "humantime" @@ -583,6 +582,15 @@ version = "2.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e186cfbae8084e513daff4240b4797e342f988cecda4fb6c939150f96315fd8" +[[package]] +name = "inplace-vec-builder" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf64c2edc8226891a71f127587a2861b132d2b942310843814d5001d99a1d307" +dependencies = [ + "smallvec", +] + [[package]] name = "is-terminal" version = "0.4.9" @@ -594,6 +602,15 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "itertools" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25db6b064527c5d482d0423354fcd07a89a2dfe07b67892e62411946db7f07b0" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.9" @@ -730,9 +747,9 @@ checksum = "5486aed0026218e61b8a01d5fbd5a0a134649abb71a0e53b7bc088529dced86e" [[package]] name = "memmap2" -version = "0.5.10" +version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83faa42c0a078c393f6b29d5db232d8be22776a891f8f56e5284faee4a20b327" +checksum = "45fd3a57831bf88bc63f8cebc0cf956116276e97fef3966103e96416209f7c92" dependencies = [ "libc", ] @@ -767,15 +784,6 @@ version = "0.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2983372caf4480544083767bf2d27defafe32af49ab4df3a0b7fc90793a3664" -[[package]] -name = "nanorand" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a51313c5820b0b02bd422f4b44776fbf47961755c74ce64afc73bfad10226c3" -dependencies = [ - "getrandom", -] - [[package]] name = "needletail" version = "0.5.1" @@ -857,15 +865,6 @@ dependencies = [ "autocfg", ] -[[package]] -name = "numsep" -version = "0.1.12" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "ad5c49c3e12c314efb1f43cba136031b657dcd59ee26936ab2be313c5e97da22" -dependencies = [ - "slicestring", -] - [[package]] name = "once_cell" version = "1.18.0" @@ -874,25 +873,27 @@ checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" [[package]] name = "ouroboros" -version = "0.15.6" +version = "0.18.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1358bd1558bd2a083fed428ffeda486fbfb323e698cdda7794259d592ca72db" +checksum = "97b7be5a8a3462b752f4be3ff2b2bf2f7f1d00834902e46be2a4d68b87b0573c" dependencies = [ "aliasable", "ouroboros_macro", + "static_assertions", ] [[package]] name = "ouroboros_macro" -version = "0.15.6" +version = "0.18.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f7d21ccd03305a674437ee1248f3ab5d4b1db095cf1caf49f1713ddf61956b7" +checksum = "b645dcde5f119c2c454a92d0dfa271a2a3b205da92e4292a68ead4bdbfde1f33" dependencies = [ - "Inflector", - "proc-macro-error", + "heck", + "itertools", "proc-macro2", + "proc-macro2-diagnostics", "quote", - "syn 1.0.109", + "syn 2.0.48", ] [[package]] @@ -924,39 +925,20 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" -[[package]] -name = "pin-project" -version = "1.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fda4ed1c6c173e3fc7a83629421152e01d7b1f9b7f65fb301e490e8cfc656422" -dependencies = [ - "pin-project-internal", -] - -[[package]] -name = "pin-project-internal" -version = "1.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4359fd9c9171ec6e8c62926d6faaf553a8dc3f64e1507e76da7911b4f6a04405" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.48", -] - [[package]] name = "piz" -version = "0.4.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"58c75d1c00e6d407e283cc66d9d4fd0985ef1703c761520845b93c4f981bfb65" +checksum = "898b071c1938a2c92b95c18708cbf38f2566a01f0ab9dd7bdf4329987e5c2e17" dependencies = [ + "camino", "chrono", "codepage-437", "crc32fast", "flate2", "log", + "memchr", "thiserror", - "twoway", ] [[package]] @@ -1053,6 +1035,19 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "proc-macro2-diagnostics" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af066a9c399a26e020ada66a034357a868728e72cd426f3adcd35f80d88d88c8" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", + "version_check", + "yansi", +] + [[package]] name = "ptr_meta" version = "0.1.4" @@ -1264,12 +1259,13 @@ checksum = "8c31b5c4033f8fdde8700e4657be2c497e7288f01515be52168c631e2e4d4086" [[package]] name = "rkyv" -version = "0.7.42" +version = "0.7.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0200c8230b013893c0b2d6213d6ec64ed2b9be2e0e016682b7224ff82cff5c58" +checksum = "527a97cdfef66f65998b5f3b637c26f5a5ec09cc52a3f9932313ac645f4190f5" dependencies = [ "bitvec", "bytecheck", + "bytes", "hashbrown", "ptr_meta", "rend", @@ -1281,9 +1277,9 @@ dependencies = [ [[package]] name = "rkyv_derive" -version = "0.7.42" +version = "0.7.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2e06b915b5c230a17d7a736d1e2e63ee753c256a8614ef3f5147b13a4f5541d" +checksum = "b5c462a1328c8e67e4d6dbad1eb0355dd43e8ab432c6e227a43657f16ade5033" dependencies = [ "proc-macro2", "quote", @@ -1292,9 +1288,9 @@ dependencies = [ [[package]] name = "roaring" -version = "0.9.0" +version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd539cab4e32019956fe7e0cf160bb6d4802f4be2b52c4253d76d3bb0f85a5f7" +checksum = "6106b5cf8587f5834158895e9715a3c6c9716c8aefab57f1f7680917191c7873" dependencies = [ "bytemuck", "byteorder", @@ -1403,18 +1399,6 @@ version = "0.3.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "8542b68b8800c3cda649d2c72d688b6907b30f1580043135d61669d4aad1c175" -[[package]] -name = "size" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fed904c7fb2856d868b92464fc8fa597fce366edea1a9cbfaa8cb5fe080bd6d" - -[[package]] -name = "slicestring" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "636b979c5672ac7c2a1120ca0a9a6074cd090dadfec42af6f8a5baea1223d180" - [[package]] name = "smallvec" version = "1.11.0" @@ -1430,15 +1414,18 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.12.0" -source = "git+https://github.com/sourmash-bio/sourmash?rev=ff1092f8f366339caa59d7203f623813228f4356#ff1092f8f366339caa59d7203f623813228f4356" +source = "git+https://github.com/sourmash-bio/sourmash?branch=select-downsample#efd1ee420dbf872462c3bc56defd023a6a6234e5" dependencies = [ "az", - "bytecount", "byteorder", + "camino", "cfg-if", + "chrono", "counter", + "csv", + "enum_dispatch", "fixedbitset", - "flume", + "getrandom", "getset", "histogram", "log", @@ -1448,7 +1435,6 @@ dependencies = [ "niffler", "nohash-hasher", "num-iter", - "numsep", "once_cell", "ouroboros", "piz", @@ -1459,7 +1445,6 @@ dependencies = [ "rocksdb", "serde", "serde_json", - "size", "thiserror", "twox-hash", "typed-builder", @@ -1492,15 +1477,6 @@ dependencies = [ "zip", ] -[[package]] -name = "spin" -version = "0.9.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" -dependencies = [ - "lock_api", -] - [[package]] name = "static_assertions" version = "1.1.0" @@ -1615,16 +1591,6 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" -[[package]] -name = "twoway" -version = 
"0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c57ffb460d7c24cd6eda43694110189030a3d1dfe418416d9468fd1c1d290b47" -dependencies = [ - "memchr", - "unchecked-index", -] - [[package]] name = "twox-hash" version = "1.6.3" @@ -1638,20 +1604,23 @@ dependencies = [ [[package]] name = "typed-builder" -version = "0.10.0" +version = "0.18.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89851716b67b937e393b3daa8423e67ddfc4bbbf1654bcf05488e95e0828db0c" +checksum = "444d8748011b93cb168770e8092458cb0f8854f931ff82fdf6ddfbd72a9c933e" dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.109", + "typed-builder-macro", ] [[package]] -name = "unchecked-index" -version = "0.2.2" +name = "typed-builder-macro" +version = "0.18.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eeba86d422ce181a719445e51872fa30f1f7413b62becb52e95ec91aa262d85c" +checksum = "563b3b88238ec95680aef36bdece66896eaa7ce3c0f1b4f39d38fb2435261352" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", +] [[package]] name = "unicode-ident" @@ -1679,10 +1648,13 @@ checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" [[package]] name = "vec-collections" -version = "0.3.6" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f2390c4dc8ae8640c57d067b1a3d40bc05c124cc6bc7394d761b53435d41b76" +checksum = "3c9965c8f2ffed1dbcd16cafe18a009642f540fa22661c6cfd6309ddb02e4982" dependencies = [ + "binary-merge", + "inplace-vec-builder", + "lazy_static", "num-traits", "serde", "smallvec", @@ -1972,6 +1944,12 @@ dependencies = [ "lzma-sys", ] +[[package]] +name = "yansi" +version = "1.0.0-rc.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1367295b8f788d371ce2dbc842c7b709c73ee1364d30351dd300ec2203b12377" + [[package]] name = "zip" version = "0.6.6" From 2fcf684d89f109b6763697d7f6556f36d2cce695 Mon Sep 17 00:00:00 2001 
From: "N. Tessa Pierce-Ward" Date: Tue, 23 Jan 2024 20:47:53 -0800 Subject: [PATCH 15/47] switch to commit in latest br --- Cargo.lock | 298 +++++++++++++++++++++++------------------------------ Cargo.toml | 3 +- 2 files changed, 132 insertions(+), 169 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e593bb3c..a9e84652 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10,9 +10,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "ahash" -version = "0.7.6" +version = "0.7.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47" +checksum = "5a824f2aa7e75a0c98c5a504fceb80649e9c35265d44525b5f94de4771a395cd" dependencies = [ "getrandom", "once_cell", @@ -21,9 +21,9 @@ dependencies = [ [[package]] name = "aho-corasick" -version = "1.0.5" +version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c378d78423fdad8089616f827526ee33c19f2fddbd5de1629152c9593ba4783" +checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" dependencies = [ "memchr", ] @@ -51,9 +51,9 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.2" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15c4c2c83f81532e5845a733998b6971faca23490340a418e9b72a3ec9de12ea" +checksum = "7079075b41f533b8c61d2a4d073c4676e1f8b249ff94a393b0595db304e0dd87" [[package]] name = "anyhow" @@ -139,9 +139,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.4.0" +version = "2.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4682ae6287fcf752ecaabbfcc7b6f9b72aa33933dc23a554d853aea8eea8635" +checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf" [[package]] name = "bitvec" @@ -157,9 +157,9 @@ dependencies = [ [[package]] name = "bstr" -version = 
"1.6.2" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c2f7349907b712260e64b0afe2f84692af14a454be26187d9df565c7f69266a" +checksum = "c48f0051a4b4c5e0b6d365cd04af53aeaa209e3cc15ec2cdb69e73cc87fbd0dc" dependencies = [ "memchr", "regex-automata", @@ -178,9 +178,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.13.0" +version = "3.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1" +checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" [[package]] name = "bytecheck" @@ -206,21 +206,21 @@ dependencies = [ [[package]] name = "bytecount" -version = "0.6.3" +version = "0.6.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c" +checksum = "e1e5f035d16fc623ae5f74981db80a439803888314e3a555fd6f04acd51a3205" [[package]] name = "bytemuck" -version = "1.13.1" +version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17febce684fd15d89027105661fec94afb475cb995fbc59d2865198446ba2eea" +checksum = "374d28ec25809ee0e23827c2ab573d729e293f281dfe393500e7ad618baa61c6" [[package]] name = "byteorder" -version = "1.4.3" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" @@ -285,24 +285,23 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.28" +version = "0.4.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95ed24df0632f708f5f6d8082675bef2596f7084dee3dd55f632290bf35bfe0f" +checksum = "41daef31d7a747c5c847246f36de49ced6f7403b4cdabc807a97b5cc184cda7a" 
dependencies = [ "android-tzdata", "iana-time-zone", "js-sys", "num-traits", - "time", "wasm-bindgen", - "windows-targets 0.48.5", + "windows-targets 0.52.0", ] [[package]] name = "clang-sys" -version = "1.6.1" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c688fc74432808e3eb684cae8830a86be1d66a2bd58e1f248ed0960a590baf6f" +checksum = "67523a3b4be3ce1989d607a828d036249522dd9c1c8de7f4dd2dae43a37369d1" dependencies = [ "glob", "libc", @@ -320,9 +319,9 @@ dependencies = [ [[package]] name = "core-foundation-sys" -version = "0.8.4" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" +checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" [[package]] name = "counter" @@ -344,36 +343,28 @@ dependencies = [ [[package]] name = "crossbeam-deque" -version = "0.8.3" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" +checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" dependencies = [ - "cfg-if", "crossbeam-epoch", "crossbeam-utils", ] [[package]] name = "crossbeam-epoch" -version = "0.9.15" +version = "0.9.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" dependencies = [ - "autocfg", - "cfg-if", "crossbeam-utils", - "memoffset", - "scopeguard", ] [[package]] name = "crossbeam-utils" -version = "0.8.16" +version = "0.8.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294" -dependencies = [ - "cfg-if", -] +checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" [[package]] 
name = "csv" @@ -451,9 +442,9 @@ dependencies = [ [[package]] name = "fastrand" -version = "2.0.0" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6999dc1837253364c2ebb0704ba97994bd874e8f195d665c50b7548f6ea92764" +checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" [[package]] name = "fixedbitset" @@ -463,9 +454,9 @@ checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" [[package]] name = "flate2" -version = "1.0.27" +version = "1.0.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6c98ee8095e9d1dcbf2fcc6d95acccb90d1c81db1e44725c6a984b1dbdfb010" +checksum = "46303f565772937ffe1d394a4fac6f411c6013172fadde9dcdb1e147a086940e" dependencies = [ "crc32fast", "miniz_oxide", @@ -488,14 +479,14 @@ checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" [[package]] name = "getrandom" -version = "0.2.10" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" +checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" dependencies = [ "cfg-if", "js-sys", "libc", - "wasi 0.11.0+wasi-snapshot-preview1", + "wasi", "wasm-bindgen", ] @@ -534,15 +525,15 @@ checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" [[package]] name = "hermit-abi" -version = "0.3.2" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b" +checksum = "5d3d0e0f38255e7fa3cf31335b3a56f05febd18025f4db5ef7a0cfb4f8da651f" [[package]] name = "histogram" -version = "0.8.4" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de0f59c8ab5f8d1f1dd481174172ce418e2e306d665cdd8057c0bd457c447159" +checksum = "e5ee9487899388cf1a1155759c39e3c156c5d198b6da1734053954a6e40e6d4d" 
dependencies = [ "thiserror", ] @@ -555,16 +546,16 @@ checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" [[package]] name = "iana-time-zone" -version = "0.1.57" +version = "0.1.59" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fad5b825842d2b38bd206f3e81d6957625fd7f0a361e345c30e01a0ae2dd613" +checksum = "b6a67363e2aa4443928ce15e57ebae94fd8949958fd1223c4cfc0cd473ad7539" dependencies = [ "android_system_properties", "core-foundation-sys", "iana-time-zone-haiku", "js-sys", "wasm-bindgen", - "windows", + "windows-core", ] [[package]] @@ -593,13 +584,13 @@ dependencies = [ [[package]] name = "is-terminal" -version = "0.4.9" +version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" +checksum = "0bad00257d07be169d870ab665980b06cdb366d792ad690bf2e76876dc503455" dependencies = [ "hermit-abi", "rustix", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] @@ -613,24 +604,24 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.9" +version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" +checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" [[package]] name = "jobserver" -version = "0.1.26" +version = "0.1.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2" +checksum = "8c37f63953c4c63420ed5fd3d6d398c719489b9f872b9fa683262f8edd363c7d" dependencies = [ "libc", ] [[package]] name = "js-sys" -version = "0.3.66" +version = "0.3.67" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cee9c64da59eae3b50095c18d3e74f8b73c0b86d2792824ff01bbce68ba229ca" +checksum = "9a1d36f1235bc969acba30b7f5990b864423a6068a10f7c90ae8f0112e3a59d1" dependencies = [ 
"wasm-bindgen", ] @@ -649,18 +640,18 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" -version = "0.2.151" +version = "0.2.152" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "302d7ab3130588088d277783b1e2d2e10c9e9e4a16dd9050e6ec93fb3e7048f4" +checksum = "13e3bf6590cbc649f4d1a3eefc9d5d6eb746f5200ffb04e5e142700b8faa56e7" [[package]] name = "libloading" -version = "0.7.4" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" +checksum = "c571b676ddfc9a8c12f1f3d3085a7b163966a8fd8098a90640953ce5f6170161" dependencies = [ "cfg-if", - "winapi", + "windows-sys 0.48.0", ] [[package]] @@ -681,9 +672,9 @@ dependencies = [ [[package]] name = "libz-sys" -version = "1.1.12" +version = "1.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d97137b25e321a73eef1418d1d5d2eda4d77e12813f8e6dead84bc52c5870a7b" +checksum = "295c17e837573c8c821dbaeb3cceb3d745ad082f7572191409e69cbc1b3fd050" dependencies = [ "cc", "pkg-config", @@ -692,15 +683,15 @@ dependencies = [ [[package]] name = "linux-raw-sys" -version = "0.4.12" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4cd1a83af159aa67994778be9070f0ae1bd732942279cabb14f86f986a21456" +checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" [[package]] name = "lock_api" -version = "0.4.10" +version = "0.4.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16" +checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" dependencies = [ "autocfg", "scopeguard", @@ -741,9 +732,9 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "memchr" -version = "2.6.2" +version = "2.7.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "5486aed0026218e61b8a01d5fbd5a0a134649abb71a0e53b7bc088529dced86e" +checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" [[package]] name = "memmap2" @@ -858,18 +849,18 @@ dependencies = [ [[package]] name = "num-traits" -version = "0.2.16" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2" +checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c" dependencies = [ "autocfg", ] [[package]] name = "once_cell" -version = "1.18.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "ouroboros" @@ -908,13 +899,13 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.8" +version = "0.9.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447" +checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" dependencies = [ "cfg-if", "libc", - "redox_syscall 0.3.5", + "redox_syscall", "smallvec", "windows-targets 0.48.5", ] @@ -943,9 +934,9 @@ dependencies = [ [[package]] name = "pkg-config" -version = "0.3.27" +version = "0.3.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" +checksum = "2900ede94e305130c13ddd391e0ab7cbaeb783945ae07a279c268cb05109c6cb" [[package]] name = "ppv-lite86" @@ -985,9 +976,9 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.2.12" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"6c64d9ba0963cdcea2e1b2230fbae2bab30eb25a174be395c41e764bfb65dd62" +checksum = "a41cf62165e97c7f814d2221421dbb9afcbcdb0a88068e5ea206e19951c2cbb5" dependencies = [ "proc-macro2", "syn 2.0.48", @@ -1028,9 +1019,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.76" +version = "1.0.78" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95fc56cda0b5c3325f5fbbd7ff9fda9e02bb00bb3dac51252d2f1bfa1cb8cc8c" +checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" dependencies = [ "unicode-ident", ] @@ -1195,15 +1186,6 @@ dependencies = [ "crossbeam-utils", ] -[[package]] -name = "redox_syscall" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" -dependencies = [ - "bitflags 1.3.2", -] - [[package]] name = "redox_syscall" version = "0.4.1" @@ -1215,9 +1197,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.9.4" +version = "1.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12de2eff854e5fa4b1295edd650e227e9d8fb0c9e90b12e7f36d6a6811791a29" +checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15" dependencies = [ "aho-corasick", "memchr", @@ -1227,9 +1209,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.3.7" +version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49530408a136e16e5b486e883fbb6ba058e8e4e8ae6621a77b048b314336e629" +checksum = "3b7fa1134405e2ec9353fd416b17f8dacd46c473d7d3fd1cf202706a14eb792a" dependencies = [ "aho-corasick", "memchr", @@ -1238,15 +1220,15 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.7.5" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" +checksum = 
"c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" [[package]] name = "rend" -version = "0.4.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "581008d2099240d37fb08d77ad713bcaec2c4d89d50b5b21a8bb1996bbab68ab" +checksum = "a2571463863a6bd50c32f94402933f03457a3fbaf697a707c5be741e459f08fd" dependencies = [ "bytecheck", ] @@ -1315,11 +1297,11 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" [[package]] name = "rustix" -version = "0.38.28" +version = "0.38.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316" +checksum = "322394588aaf33c24007e8bb3238ee3e4c5c09c084ab32bc73890b99ff326bca" dependencies = [ - "bitflags 2.4.0", + "bitflags 2.4.2", "errno", "libc", "linux-raw-sys", @@ -1328,9 +1310,9 @@ dependencies = [ [[package]] name = "ryu" -version = "1.0.15" +version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ad4cc8da4ef723ed60bced201181d83791ad433213d8c24efffda1eec85d741" +checksum = "f98d2aa92eebf49b69786be48e4477826b256916e84a57ff2a4f21923b48eb4c" [[package]] name = "safemem" @@ -1383,9 +1365,9 @@ dependencies = [ [[package]] name = "shlex" -version = "1.1.0" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "simdutf8" @@ -1401,9 +1383,9 @@ checksum = "8542b68b8800c3cda649d2c72d688b6907b30f1580043135d61669d4aad1c175" [[package]] name = "smallvec" -version = "1.11.0" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9" +checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" [[package]] name = 
"sorted-iter" @@ -1414,7 +1396,7 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.12.0" -source = "git+https://github.com/sourmash-bio/sourmash?branch=select-downsample#efd1ee420dbf872462c3bc56defd023a6a6234e5" +source = "git+https://github.com/sourmash-bio/sourmash?rev=94b88cc314f781342721addc5ed35c531732a9b6#94b88cc314f781342721addc5ed35c531732a9b6" dependencies = [ "az", "byteorder", @@ -1513,9 +1495,9 @@ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" [[package]] name = "target-lexicon" -version = "0.12.11" +version = "0.12.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d0e916b1148c8e263850e1ebcbd046f333e0683c724876bb0da63ea4373dc8a" +checksum = "69758bda2e78f098e4ccb393021a0963bb3442eac05f135c30f61b7370bbafae" [[package]] name = "tempfile" @@ -1525,16 +1507,16 @@ checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa" dependencies = [ "cfg-if", "fastrand", - "redox_syscall 0.4.1", + "redox_syscall", "rustix", "windows-sys 0.52.0", ] [[package]] name = "termcolor" -version = "1.2.0" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6" +checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755" dependencies = [ "winapi-util", ] @@ -1547,35 +1529,24 @@ checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76" [[package]] name = "thiserror" -version = "1.0.47" +version = "1.0.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97a802ec30afc17eee47b2855fc72e0c4cd62be9b4efe6591edde0ec5bd68d8f" +checksum = "d54378c645627613241d077a3a79db965db602882668f9136ac42af9ecb730ad" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.47" +version = "1.0.56" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bb623b56e39ab7dcd4b1b98bb6c8f8d907ed255b18de254088016b27a8ee19b" +checksum = "fa0faa943b50f3db30a20aa7e265dbc66076993efed8463e8de414e5d06d3471" dependencies = [ "proc-macro2", "quote", "syn 2.0.48", ] -[[package]] -name = "time" -version = "0.1.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a" -dependencies = [ - "libc", - "wasi 0.10.0+wasi-snapshot-preview1", - "winapi", -] - [[package]] name = "tinyvec" version = "1.6.0" @@ -1624,9 +1595,9 @@ dependencies = [ [[package]] name = "unicode-ident" -version = "1.0.11" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "301abaae475aa91687eb82514b328ab47a211a533026cb25fc3e519b86adfc3c" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] name = "unindent" @@ -1636,9 +1607,9 @@ checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce" [[package]] name = "uuid" -version = "1.4.1" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79daa5ed5740825c40b389c5e50312b9c86df53fccd33f281df655642b43869d" +checksum = "f00cc9702ca12d3c81455259621e676d0f7251cec66a21e98fe2e9a37db93b2a" [[package]] name = "vcpkg" @@ -1676,12 +1647,6 @@ dependencies = [ "libc", ] -[[package]] -name = "wasi" -version = "0.10.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" - [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" @@ -1690,9 +1655,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.89" +version = "0.2.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"0ed0d4f68a3015cc185aff4db9506a015f4b96f95303897bfa23f846db54064e" +checksum = "b1223296a201415c7fad14792dbefaace9bd52b62d33453ade1c5b5f07555406" dependencies = [ "cfg-if", "serde", @@ -1702,9 +1667,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.89" +version = "0.2.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b56f625e64f3a1084ded111c4d5f477df9f8c92df113852fa5a374dbda78826" +checksum = "fcdc935b63408d58a32f8cc9738a0bffd8f05cc7c002086c6ef20b7312ad9dcd" dependencies = [ "bumpalo", "log", @@ -1717,9 +1682,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.89" +version = "0.2.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0162dbf37223cd2afce98f3d0785506dcb8d266223983e4b5b525859e6e182b2" +checksum = "3e4c238561b2d428924c49815533a8b9121c664599558a5d9ec51f8a1740a999" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -1727,9 +1692,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.89" +version = "0.2.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0eb82fcb7930ae6219a7ecfd55b217f5f0893484b7a13022ebb2b2bf20b5283" +checksum = "bae1abb6806dc1ad9e560ed242107c0f6c84335f1749dd4e8ddb012ebd5e25a7" dependencies = [ "proc-macro2", "quote", @@ -1740,15 +1705,15 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.89" +version = "0.2.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ab9b36309365056cd639da3134bf87fa8f3d86008abf99e612384a6eecd459f" +checksum = "4d91413b1c31d7539ba5ef2451af3f0b833a005eb27a631cec32bc0635a8602b" [[package]] name = "web-sys" -version = "0.3.66" +version = "0.3.67" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50c24a44ec86bb68fbecd1b3efed7e85ea5621b39b35ef2766b66cd984f8010f" +checksum = "58cd2333b6e0be7a39605f0e255892fd7418a682d8da8fe042fe25128794d2ed" dependencies 
= [ "js-sys", "wasm-bindgen", @@ -1772,9 +1737,9 @@ checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-util" -version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" dependencies = [ "winapi", ] @@ -1786,12 +1751,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] -name = "windows" -version = "0.48.0" +name = "windows-core" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" dependencies = [ - "windows-targets 0.48.5", + "windows-targets 0.52.0", ] [[package]] @@ -1983,11 +1948,10 @@ dependencies = [ [[package]] name = "zstd-sys" -version = "2.0.8+zstd.1.5.5" +version = "2.0.9+zstd.1.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c" +checksum = "9e16efa8a874a0481a574084d34cc26fdb3b99627480f785888deb6386506656" dependencies = [ "cc", - "libc", "pkg-config", ] diff --git a/Cargo.toml b/Cargo.toml index b0986983..21b3976e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,8 +12,7 @@ crate-type = ["cdylib"] pyo3 = { version = "0.20.2", features = ["extension-module", "anyhow"] } rayon = "1.8.1" serde = { version = "1.0.195", features = ["derive"] } -sourmash = { git = "https://github.com/sourmash-bio/sourmash", branch="select-downsample", features = ["branchwater"] } -#sourmash = { git = "https://github.com/sourmash-bio/sourmash", rev= "94b88cc314f781342721addc5ed35c531732a9b6", features = ["branchwater"] } +sourmash = { git = 
"https://github.com/sourmash-bio/sourmash", rev= "94b88cc314f781342721addc5ed35c531732a9b6", features = ["branchwater"] } #sourmash = { version = "0.12.0", features = ["branchwater"] } serde_json = "1.0.111" niffler = "2.4.0" From 86d6c1645bc1928bc808bf4b57820fb57667e3c2 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 23 Jan 2024 20:59:47 -0800 Subject: [PATCH 16/47] cleanup unused imports and code --- src/index.rs | 20 +------------------- src/mastiff_manygather.rs | 15 ++------------- src/mastiff_manysearch.rs | 12 +----------- 3 files changed, 4 insertions(+), 43 deletions(-) diff --git a/src/index.rs b/src/index.rs index 6768d058..23675614 100644 --- a/src/index.rs +++ b/src/index.rs @@ -1,20 +1,15 @@ -//use sourmash::index::revindex::RevIndex; +use camino::Utf8PathBuf as PathBuf; use sourmash::collection::Collection; use sourmash::index::revindex::RevIndex; -// use sourmash::index::revindex::{prepare_query, RevIndex, RevIndexOps}; use sourmash::manifest::Manifest; use sourmash::prelude::*; -// use sourmash::signature::{Signature, SigsTrait}; use sourmash::storage::{FSStorage, InnerStorage, ZipStorage}; -// use sourmash::sketch::Sketch; -use camino::Utf8PathBuf as PathBuf; use std::path::Path; use crate::utils::load_sketchlist_filenames; pub fn index>( siglist: PathBuf, - // template: Sketch, manifest: Option

, selection: Selection, output: P, @@ -23,19 +18,6 @@ pub fn index>( ) -> Result<(), Box> { println!("Loading siglist"); - // let (index_sigs, _temp_dir) = load_sigpaths_from_zip_or_pathlist(&siglist)?; - - // // if index_sigs pathlist is empty, bail - // if index_sigs.is_empty() { - // bail!("No signatures to index loaded, exiting."); - // } - - // // Create or open the RevIndex database with the provided output path and colors flag - // let db = RevIndex::create(output.as_ref(), colors); - - // // Index the signatures using the loaded template, threshold, and save_paths option - // db.index(index_sigs, &template, 0.0, save_paths); - let manifest = if let Some(m) = manifest { let rdr = std::fs::OpenOptions::new().read(true).open(m.as_ref())?; Some(Manifest::from_reader(rdr)?) diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index b581d73e..a8f78bd7 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -2,22 +2,13 @@ use anyhow::Result; use rayon::prelude::*; -use sourmash::ffi::signature; use sourmash::signature::Signature; -use sourmash::sketch::minhash::KmerMinHash; use sourmash::sketch::Sketch; use std::path::Path; -// use sourmash::collection::Collection; -// use sourmash::selection::Selection; use sourmash::prelude::*; -// use sourmash::index::revindex::{prepare_query, RevIndex, RevIndexOps}; -// use sourmash::manifest::Manifest; -// use sourmash::prelude::*; -// use sourmash::signature::{Signature, SigsTrait}; -// use sourmash::storage::{FSStorage, InnerStorage, ZipStorage}; -use sourmash::index::revindex::{prepare_query, RevIndex, RevIndexOps}; +use sourmash::index::revindex::{RevIndex, RevIndexOps}; use std::sync::atomic; use std::sync::atomic::AtomicUsize; @@ -25,7 +16,7 @@ use std::sync::atomic::AtomicUsize; use std::fs::File; use std::io::{BufWriter, Write}; -use crate::utils::{is_revindex_database, load_sigpaths_from_zip_or_pathlist, ReportType}; // prepare_query +use crate::utils::{is_revindex_database, 
load_sigpaths_from_zip_or_pathlist, ReportType}; pub fn mastiff_manygather>( queries_file: P, @@ -88,7 +79,6 @@ pub fn mastiff_manygather>( let send = query_paths .par_iter() .filter_map(|filename| { - // ... existing setup code ... let threshold = threshold_bp / selection.scaled()? as usize; match Signature::from_path(filename) { @@ -100,7 +90,6 @@ pub fn mastiff_manygather>( for sketch in query_sig.iter() { if let Sketch::MinHash(query) = sketch { found_compatible_sketch = true; - // eprintln!("query-size: {}", sketch.size()); // Gather! let (counter, query_colors, hash_to_color) = db.prepare_gather_counters(&query); diff --git a/src/mastiff_manysearch.rs b/src/mastiff_manysearch.rs index 654c1c17..40065d62 100644 --- a/src/mastiff_manysearch.rs +++ b/src/mastiff_manysearch.rs @@ -1,20 +1,10 @@ /// mastiff_manysearch: mastiff-indexed version of manysearch. use anyhow::Result; use rayon::prelude::*; - +use sourmash::index::revindex::{RevIndex, RevIndexOps}; use sourmash::signature::{Signature, SigsTrait}; use sourmash::sketch::Sketch; use std::path::Path; - -// use sourmash::collection::Collection; -// use sourmash::index::revindex::{prepare_query, RevIndex, RevIndexOps}; -// use sourmash::manifest::Manifest; -// use sourmash::prelude::*; -// use sourmash::storage::{FSStorage, InnerStorage, ZipStorage}; - -// use sourmash::index::revindex::RevIndex; -use sourmash::index::revindex::{RevIndex, RevIndexOps}; - use std::sync::atomic; use std::sync::atomic::AtomicUsize; From 13940cd6a466e37fd6d13b2e0747b03b3048b9f7 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Wed, 24 Jan 2024 18:26:09 -0800 Subject: [PATCH 17/47] init use collection for query loading --- src/lib.rs | 2 - src/mastiff_manygather.rs | 147 +++++++++++++++++--------------------- src/utils.rs | 51 ++++++++++++- 3 files changed, 112 insertions(+), 88 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index a05d1094..1d6a227d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -122,7 +122,6 @@ fn do_fastmultigather( match mastiff_manygather::mastiff_manygather( query_filenames, siglist_path, - template, selection, threshold_bp, output_path, @@ -192,7 +191,6 @@ fn do_index( // match index::index(siglist, template, output, save_paths, colors) { // convert siglist to PathBuf // build template from ksize, scaled - let template = build_template(ksize, scaled, &moltype); let location = camino::Utf8PathBuf::from(siglist); let manifest = None; match index::index(location, manifest, selection, output, save_paths, colors) { diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index a8f78bd7..e50eefc3 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -6,6 +6,9 @@ use sourmash::signature::Signature; use sourmash::sketch::Sketch; use std::path::Path; +// use camino::Utf8Path as Path; +// use camino::Utf8PathBuf as PathBuf; + use sourmash::prelude::*; use sourmash::index::revindex::{RevIndex, RevIndexOps}; @@ -16,12 +19,11 @@ use std::sync::atomic::AtomicUsize; use std::fs::File; use std::io::{BufWriter, Write}; -use crate::utils::{is_revindex_database, load_sigpaths_from_zip_or_pathlist, ReportType}; +use crate::utils::{is_revindex_database, load_collection}; //, ReportType}; pub fn mastiff_manygather>( - queries_file: P, + queries_file: String, index: P, - template: Sketch, selection: Selection, threshold_bp: usize, output: Option

, @@ -36,10 +38,7 @@ pub fn mastiff_manygather>( let db = RevIndex::open(index.as_ref(), true)?; println!("Loaded DB"); - // Load query paths - let queryfile_name = queries_file.as_ref().to_string_lossy().to_string(); - let (query_paths, _temp_dir) = - load_sigpaths_from_zip_or_pathlist(&queries_file, &template, ReportType::Query)?; + let query_collection = load_collection(camino::Utf8PathBuf::from(queries_file), &selection)?; // set up a multi-producer, single-consumer channel. let (send, recv) = std::sync::mpsc::sync_channel(rayon::current_num_threads()); @@ -76,90 +75,72 @@ pub fn mastiff_manygather>( let skipped_paths = AtomicUsize::new(0); let failed_paths = AtomicUsize::new(0); - let send = query_paths - .par_iter() - .filter_map(|filename| { - let threshold = threshold_bp / selection.scaled()? as usize; - - match Signature::from_path(filename) { - Ok(mut signatures) if !signatures.is_empty() => { - match signatures.swap_remove(0).select(&selection) { - Ok(query_sig) => { - let mut results = vec![]; - let mut found_compatible_sketch = false; - for sketch in query_sig.iter() { - if let Sketch::MinHash(query) = sketch { - found_compatible_sketch = true; - // Gather! 
- let (counter, query_colors, hash_to_color) = - db.prepare_gather_counters(&query); - - let matches = db.gather( - counter, - query_colors, - hash_to_color, - threshold, - &query, - Some(selection.clone()), - ); - // extract results - if let Ok(matches) = matches { - for match_ in &matches { - results.push(( - query_sig.name().clone(), - query.md5sum().clone(), - match_.name().clone(), - match_.md5().clone(), - match_.f_match(), // f_match_query - match_.intersect_bp(), - )); // intersect_bp - } - } else { - eprintln!("Error gathering matches: {:?}", matches.err()); - } - } - } - if !found_compatible_sketch { - if !queryfile_name.ends_with(".zip") { - eprintln!( - "WARNING: no compatible sketches in path '{}'", - filename.display() - ); - } - let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); + let send = query_collection + .par_iter() + .filter_map(|(idx, record)| { + let threshold = threshold_bp / selection.scaled()? as usize; + + match query_collection.sig_for_dataset(idx) { + Ok(query_sig) => { + let mut results = vec![]; + let mut found_compatible_sketch = false; + for sketch in query_sig.iter() { + if let Sketch::MinHash(query) = sketch { + found_compatible_sketch = true; + // Gather! 
+ let (counter, query_colors, hash_to_color) = + db.prepare_gather_counters(&query); + + let matches = db.gather( + counter, + query_colors, + hash_to_color, + threshold, + &query, + Some(selection.clone()), + ); + // extract results + if let Ok(matches) = matches { + for match_ in &matches { + results.push(( + query_sig.name().clone(), + query.md5sum().clone(), + match_.name().clone(), + match_.md5().clone(), + match_.f_match(), // f_match_query + match_.intersect_bp(), + )); // intersect_bp } - - if results.is_empty() { - None - } else { - Some(results) - } - } - Err(err) => { - eprintln!("Error selecting sketches: {}", err); - let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); - None + } else { + eprintln!("Error gathering matches: {:?}", matches.err()); } } } - Ok(_) => { - eprintln!("No signatures found in '{}'", filename.display()); - let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); - None + if !found_compatible_sketch { + // if !queryfile_name.ends_with(".zip") { + // eprintln!( + // "WARNING: no compatible sketches in path '{}'", + // filename.display() + // ); + // } + let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } - Err(err) => { - eprintln!( - "WARNING: could not load sketches from path '{}': {}", - filename.display(), - err - ); - let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); + + if results.is_empty() { None + } else { + Some(results) } } - }) - .flatten() - .try_for_each_with(send, |s, m| s.send(m)); + Err(err) => { + eprintln!("Error loading sketch: {}", err); + let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); + None + } + } + }) + .flatten() + .try_for_each_with(send, |s, m| s.send(m)); // do some cleanup and error handling - if let Err(e) = send { diff --git a/src/utils.rs b/src/utils.rs index eefbaa14..64472ae2 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,6 +1,8 @@ /// Utility functions for sourmash_plugin_branchwater. 
use rayon::prelude::*; use sourmash::encodings::HashFunctions; +use sourmash::manifest::Manifest; +use sourmash::selection::Select; use std::fs::File; use std::io::Read; @@ -19,13 +21,13 @@ use anyhow::{anyhow, Result}; use std::cmp::{Ordering, PartialOrd}; -// use sourmash::prelude::FracMinHashOps; -// use sourmash::prelude::HashOps; use sourmash::signature::{Signature, SigsTrait}; use sourmash::sketch::minhash::{max_hash_for_scaled, KmerMinHash}; use sourmash::sketch::Sketch; +use sourmash::collection::Collection; +use sourmash::selection::Selection; + -// use tempfile::tempdir; /// Track a name/minhash. pub struct SmallSignature { @@ -225,6 +227,31 @@ pub fn load_sketchlist_filenames>(sketchlist_filename: &P) -> Res Ok(sketchlist_filenames) } +pub fn load_sketchlist_filenames_camino>(sketchlist_filename: &P) -> Result> { + let sketchlist_file = BufReader::new(File::open(sketchlist_filename)?); + + let mut sketchlist_filenames: Vec = Vec::new(); + for line in sketchlist_file.lines() { + let line = match line { + Ok(v) => v, + Err(_) => { + return { + let filename = sketchlist_filename.as_ref().display(); + let msg = format!("invalid line in fromfile '{}'", filename); + Err(anyhow!(msg)) + } + } + }; + + if !line.is_empty() { + let path = camino::Utf8PathBuf::from(line); + sketchlist_filenames.push(path); + } + } + Ok(sketchlist_filenames) +} + + /// Loads signature file paths from a ZIP archive. /// /// This function extracts the contents of a ZIP archive containing @@ -649,6 +676,7 @@ pub fn load_sketches_from_zip_or_pathlist>( .map(|ext| ext == "zip") .unwrap_or(false) { + load_sketches_from_zip(sketchlist_path, template)? 
} else { let sketch_paths = load_sketchlist_filenames(&sketchlist_path)?; @@ -660,6 +688,23 @@ pub fn load_sketches_from_zip_or_pathlist>( Ok(sketchlist) } +pub fn load_collection( + sigpath: camino::Utf8PathBuf, + selection: &Selection, +) -> Result { + let collection = if sigpath.extension().map_or(false, |ext| ext == "zip") { + Collection::from_zipfile(&sigpath)? + } else { + let sig_paths: Vec<_> = load_sketchlist_filenames_camino(&sigpath) + .unwrap_or_else(|_| panic!("Error loading siglist")) + .into_iter() + .collect(); + Collection::from_paths(&sig_paths)? + }; + // return collection records that match selection + Ok(collection.select(&selection)?) +} + /// Uses the output of sketch loading functions to report the /// total number of sketches loaded, as well as the number of files, /// if any, that failed to load or contained no compatible sketches. From cd8be99889c41489bcdc6104380033d1a6f95de1 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 25 Jan 2024 18:28:54 -0800 Subject: [PATCH 18/47] ...collection loading in progress --- src/check.rs | 10 +- src/fastgather.rs | 215 ++++++++++++++++++-------- src/fastmultigather.rs | 152 +++++++++--------- src/lib.rs | 83 +++++----- src/mastiff_manygather.rs | 25 ++- src/mastiff_manysearch.rs | 87 ++++++----- src/python/tests/test_multigather.py | 8 +- src/utils.rs | 222 +++++++++++++++++++-------- 8 files changed, 482 insertions(+), 320 deletions(-) diff --git a/src/check.rs b/src/check.rs index 7fea2eca..1311318c 100644 --- a/src/check.rs +++ b/src/check.rs @@ -1,19 +1,17 @@ -use std::path::Path; - use crate::utils::is_revindex_database; use sourmash::index::revindex::{RevIndex, RevIndexOps}; -pub fn check>(index: P, quick: bool) -> Result<(), Box> { - if !is_revindex_database(index.as_ref()) { +pub fn check(index: camino::Utf8PathBuf, quick: bool) -> Result<(), Box> { + if !is_revindex_database(&index) { bail!( "'{}' is not a valid RevIndex database", - index.as_ref().display() + index ); } 
println!("Opening DB"); - let db = RevIndex::open(index.as_ref(), true)?; + let db = RevIndex::open(index, true)?; println!("Starting check"); db.check(quick); diff --git a/src/fastgather.rs b/src/fastgather.rs index 963a6232..2fef7522 100644 --- a/src/fastgather.rs +++ b/src/fastgather.rs @@ -1,50 +1,39 @@ /// fastgather: Run gather with a query against a list of files. use anyhow::Result; -use sourmash::signature::Signature; use sourmash::sketch::Sketch; -use std::path::Path; +use sourmash::signature::Signature; +use sourmash::selection::Selection; +use camino; +use std::collections::BinaryHeap; +use crate::utils::PrefetchResult; use crate::utils::{ - consume_query_by_gather, load_sigpaths_from_zip_or_pathlist, load_sketches_above_threshold, - prepare_query, write_prefetch, ReportType, + consume_query_by_gather, load_sketches_above_threshold, write_prefetch, ReportType, load_collection }; -pub fn fastgather + std::fmt::Debug + std::fmt::Display + Clone>( - query_filename: P, - matchlist_filename: P, +pub fn fastgather( + query_filepath: camino::Utf8PathBuf, + against_filepath: camino::Utf8PathBuf, threshold_bp: usize, ksize: u8, scaled: usize, - template: Sketch, - gather_output: Option

, - prefetch_output: Option

, + selection: &Selection, + gather_output: Option, + prefetch_output: Option, ) -> Result<()> { - let location = query_filename.to_string(); - eprintln!("Loading query from '{}'", location); - let query = { - let sigs = Signature::from_path(query_filename)?; - - prepare_query(&sigs, &template, &location) - }; - // did we find anything matching the desired template? - let query = match query { - Some(query) => query, - None => bail!("No sketch found with scaled={}, k={}", scaled, ksize), - }; - - // build the list of paths to match against. - eprintln!( - "Loading matchlist from '{}'", - matchlist_filename.as_ref().display() - ); - let matchlist_filename = matchlist_filename.as_ref().to_string_lossy().to_string(); - let (matchlist_paths, _temp_dir) = - load_sigpaths_from_zip_or_pathlist(matchlist_filename, &template, ReportType::Against)?; + let query_collection = load_collection(&query_filepath, selection, ReportType::Query)?; - eprintln!("Loaded {} sig paths in matchlist", matchlist_paths.len()); + if query_collection.len() > 1 { + bail!("Found more than one compatible sketch from '{}'. Fastgather requires a single query sketch.", &query_filepath) + } + // build the list of paths to match against. 
+ eprintln!("Loading matchlist from '{}'", against_filepath); + let against_collection = load_collection(&against_filepath, selection, ReportType::Against)?; + eprintln!("Loaded {} sig paths in matchlist", against_collection.len()); + // calculate the minimum number of hashes based on desired threshold let threshold_hashes: u64 = { let x = threshold_bp / scaled; @@ -60,41 +49,131 @@ pub fn fastgather + std::fmt::Debug + std::fmt::Display + Clone>( "using threshold overlap: {} {}", threshold_hashes, threshold_bp ); - - // load a set of sketches, filtering for those with overlaps > threshold - let result = load_sketches_above_threshold( - matchlist_paths, - &template, - &query.minhash, - threshold_hashes, - )?; - let matchlist = result.0; - let skipped_paths = result.1; - let failed_paths = result.2; - - if skipped_paths > 0 { - eprintln!( - "WARNING: skipped {} search paths - no compatible signatures.", - skipped_paths - ); - } - if failed_paths > 0 { - eprintln!( - "WARNING: {} search paths failed to load. See error messages above.", - failed_paths - ); - } - - if matchlist.is_empty() { - eprintln!("No search signatures loaded, exiting."); - return Ok(()); - } - - if prefetch_output.is_some() { - write_prefetch(&query, prefetch_output, &matchlist).ok(); - } - - // run the gather! 
- consume_query_by_gather(query, matchlist, threshold_hashes, gather_output).ok(); + query_collection.iter().for_each(|(idx, record)| { + // Load query sig + match query_collection.sig_for_dataset(idx) { + Ok(query_sig) => { + let location = query_sig.filename(); + let mut matchlist: BinaryHeap = BinaryHeap::new(); + let mut skipped_paths = 0; + let mut failed_paths = 0; + + for sketch in query_sig.iter() { + // Access query MinHash + if let Sketch::MinHash(query) = sketch { + let result = load_sketches_above_threshold( + against_collection, + &selection, + &query, + threshold_hashes, + ); + + match result { + Ok((loaded_matchlist, skipped, failed)) => { + matchlist.extend(loaded_matchlist); + skipped_paths += skipped; + failed_paths += failed; + } + Err(err) => { + eprintln!("Error loading sketches: {:?}", err); + failed_paths += 1; + } + } + } + } + + if skipped_paths > 0 { + eprintln!( + "WARNING: Skipped {} search paths - no compatible signatures.", + skipped_paths + ); + } + if failed_paths > 0 { + eprintln!( + "WARNING: {} search paths failed to load. See error messages above.", + failed_paths + ); + } + + if matchlist.is_empty() { + eprintln!("No search signatures loaded for '{}', exiting.", location); + return; // Return early if no search signatures loaded + } + + if let Some(prefetch_output) = &prefetch_output { + write_prefetch(&query_sig, Some(prefetch_output.clone()), &matchlist).ok(); + } + + // Run the gather! 
+ if let Some(gather_output) = &gather_output { + if let Err(err) = consume_query_by_gather(query_sig, matchlist, threshold_hashes, Some(gather_output)) { + eprintln!("Error during gather: {:?}", err); + } + } + } + Err(_) => { + eprintln!("WARNING: Could not load query sketch '{}'", record.internal_location()); + } + } + }); Ok(()) } + +// query_collection.iter().for_each(|(idx, record)| { +// // Load query sig +// match query_collection.sig_for_dataset(idx) { +// Ok(query_sig) => { +// let location = query_sig.filename(); +// for sketch in query_sig.iter() { +// // Access query MinHash +// if let Sketch::MinHash(query) = sketch { +// let matchlist: BinaryHeap = sketchlist +// .par_iter() +// .filter_map(|sm| { +// // Call a function to load sketches above threshold +// let result = load_sketches_above_threshold( +// against_collection, +// &selection, +// &query, +// threshold_hashes, +// )?; +// let matchlist = result.0; +// let skipped_paths = result.1; +// let failed_paths = result.2; + +// if skipped_paths > 0 { +// eprintln!( +// "WARNING: skipped {} search paths - no compatible signatures.", +// skipped_paths +// ); +// } +// if failed_paths > 0 { +// eprintln!( +// "WARNING: {} search paths failed to load. See error messages above.", +// failed_paths +// ); +// } + +// if matchlist.is_empty() { +// eprintln!("No search signatures loaded, exiting."); +// return Ok(()); +// } + +// if prefetch_output.is_some() { +// write_prefetch(&query_sig, prefetch_output, &matchlist).ok(); +// } + +// // run the gather! 
+// consume_query_by_gather(query_sig, matchlist, threshold_hashes, gather_output).ok(); +// }); +// } +// } +// } +// } +// Err(_) => { +// eprintln!("WARNING: Could not load query sketch '{}'", record.internal_location()); +// } +// } +// }); +// Ok(()) +// } diff --git a/src/fastmultigather.rs b/src/fastmultigather.rs index 915b6370..70850a37 100644 --- a/src/fastmultigather.rs +++ b/src/fastmultigather.rs @@ -2,32 +2,33 @@ use anyhow::Result; use rayon::prelude::*; -use sourmash::signature::Signature; +use sourmash::storage::SigStore; +use sourmash::{selection, signature::Signature}; use sourmash::sketch::Sketch; -use std::path::Path; +use sourmash::selection::Selection; use std::sync::atomic; use std::sync::atomic::AtomicUsize; use std::collections::BinaryHeap; +use camino::Utf8PathBuf; + use crate::utils::{ - consume_query_by_gather, load_sigpaths_from_zip_or_pathlist, - load_sketches_from_zip_or_pathlist, prepare_query, write_prefetch, PrefetchResult, ReportType, + consume_query_by_gather, load_collection, load_sigpaths_from_zip_or_pathlist, load_sketches_from_zip_or_pathlist, prepare_query, write_prefetch, PrefetchResult, ReportType }; -pub fn fastmultigather + std::fmt::Debug + Clone>( - query_filenames: P, - matchlist_filename: P, +pub fn fastmultigather( + query_filepath: camino::Utf8PathBuf, + against_filepath: camino::Utf8PathBuf, threshold_bp: usize, scaled: usize, - template: Sketch, + // template: Sketch, + selection: &Selection, ) -> Result<()> { // load the list of query paths - let queryfile_name = query_filenames.as_ref().to_string_lossy().to_string(); - let (querylist_paths, _temp_dir) = - load_sigpaths_from_zip_or_pathlist(&query_filenames, &template, ReportType::Query)?; - println!("Loaded {} sig paths in querylist", querylist_paths.len()); + let query_collection = load_collection(&query_filepath, selection, ReportType::Query)?; + println!("Loaded {} sig paths in querylist", query_collection.len()); let threshold_hashes: u64 = { let x = 
threshold_bp / scaled; @@ -43,82 +44,77 @@ pub fn fastmultigather + std::fmt::Debug + Clone>( println!("threshold overlap: {} {}", threshold_hashes, threshold_bp); // Load all the against sketches - let sketchlist = - load_sketches_from_zip_or_pathlist(&matchlist_filename, &template, ReportType::Against)?; + let against_collection = load_collection(&against_filepath, selection, ReportType::Against)?; + // load actual signatures + let mut sketchlist: Vec = vec![]; + + for (idx, record) in against_collection.iter() { + if let Ok(sig) = against_collection.sig_for_dataset(idx) { + sketchlist.push(sig); + } else { + eprintln!("Failed to load 'against' record: {}", record.name()); + } + } // Iterate over all queries => do prefetch and gather! let processed_queries = AtomicUsize::new(0); let skipped_paths = AtomicUsize::new(0); let failed_paths = AtomicUsize::new(0); - querylist_paths.par_iter().for_each(|q| { - // increment counter of # of queries - let _i = processed_queries.fetch_add(1, atomic::Ordering::SeqCst); - - // set query_label to the last path element. - let location = q.clone().into_os_string().into_string().unwrap(); - let location = location.split('/').last().unwrap().to_string(); - - let query = match Signature::from_path(dbg!(q)) { - Ok(sigs) => { - let mm = prepare_query(&sigs, &template, &location); - - if mm.is_none() { - if !queryfile_name.ends_with(".zip") { - eprintln!("WARNING: no compatible sketches in path '{}'", q.display()); + query_collection.par_iter().for_each(|(idx, record)| { + // increment counter of # of queries. q: could we instead use the index from par_iter()? 
+ let _i = processed_queries.fetch_add(1, atomic::Ordering::SeqCst); + // Load query sig + match query_collection.sig_for_dataset(idx) { + Ok(query_sig) => { + let location = query_sig.filename(); + for sketch in query_sig.iter() { + // Access query MinHash + if let Sketch::MinHash(query) = sketch { + let matchlist: BinaryHeap = sketchlist + .par_iter() + .filter_map(|sm| { + let mut mm = None; + // Access against MinHash + if let Some(sketch) = sm.sketches().get(0) { + if let Sketch::MinHash(against_sketch) = sketch { + if let Ok(overlap) = against_sketch.count_common(&query, false) { + if overlap >= threshold_hashes { + let result = PrefetchResult { + name: sm.name(), + md5sum: sm.md5sum().clone(), + minhash: against_sketch.clone(), + overlap, + }; + mm = Some(result); + } + } + } + } + mm + }) + .collect(); + if !matchlist.is_empty() { + let prefetch_output = format!("{}.prefetch.csv", location); + let gather_output = format!("{}.gather.csv", location); + + // Save initial list of matches to prefetch output + write_prefetch(&query_sig, Some(prefetch_output), &matchlist).ok(); + + // Now, do the gather! 
+ consume_query_by_gather(query_sig.clone(), matchlist, threshold_hashes, Some(gather_output)).ok(); + } else { + println!("No matches to '{}'", location); } - let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } - mm - } - Err(err) => { - eprintln!("Sketch loading error: {}", err); - eprintln!( - "WARNING: could not load sketches from path '{}'", - q.display() - ); - let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); - None - } - }; - - if let Some(query) = query { - // filter first set of matches out of sketchlist - let matchlist: BinaryHeap = sketchlist - .par_iter() - .filter_map(|sm| { - let mut mm = None; - - if let Ok(overlap) = sm.minhash.count_common(&query.minhash, false) { - if overlap >= threshold_hashes { - let result = PrefetchResult { - name: sm.name.clone(), - md5sum: sm.md5sum.clone(), - minhash: sm.minhash.clone(), - overlap, - }; - mm = Some(result); - } - } - mm - }) - .collect(); - - if !matchlist.is_empty() { - let prefetch_output = format!("{location}.prefetch.csv"); - let gather_output = format!("{location}.gather.csv"); - - // save initial list of matches to prefetch output - write_prefetch(&query, Some(prefetch_output), &matchlist).ok(); - - // now, do the gather! - consume_query_by_gather(query, matchlist, threshold_hashes, Some(gather_output)) - .ok(); - } else { - println!("No matches to '{}'", location); } } - }); + Err(_) => { + eprintln!("WARNING: no compatible sketches in path '{}'", record.internal_location()); + let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); + } + } +}); println!( "Processed {} queries total.", diff --git a/src/lib.rs b/src/lib.rs index 1d6a227d..18f8e9de 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,11 +1,12 @@ /// Python interface Rust code for sourmash_plugin_branchwater. 
use pyo3::prelude::*; +use sourmash::selection; #[macro_use] extern crate simple_error; mod utils; -use crate::utils::build_template; +use crate::utils::{build_template, build_selection}; use crate::utils::is_revindex_database; mod check; mod fastgather; @@ -20,6 +21,8 @@ mod pairwise; use sourmash::encodings::HashFunctions; use sourmash::selection::Selection; +use camino::Utf8PathBuf; + #[pyfunction] fn do_manysearch( querylist_path: String, @@ -30,13 +33,19 @@ fn do_manysearch( moltype: String, output_path: Option, ) -> anyhow::Result { + + let queryfile_path: camino::Utf8PathBuf = querylist_path.clone().into(); + let againstfile_path: camino::Utf8PathBuf = siglist_path.clone().into(); + let selection = build_selection(ksize, scaled, &moltype); + // if siglist_path is revindex, run mastiff_manysearch; otherwise run manysearch let template = build_template(ksize, scaled, &moltype); - if is_revindex_database(siglist_path.as_ref()) { + if is_revindex_database(&againstfile_path) { + // if is_revindex_database(siglist_path.as_ref()) { match mastiff_manysearch::mastiff_manysearch( - querylist_path, - siglist_path, - template, + queryfile_path, + againstfile_path, + &selection, threshold, output_path, ) { @@ -74,14 +83,17 @@ fn do_fastgather( output_path_prefetch: Option, output_path_gather: Option, ) -> anyhow::Result { - let template = build_template(ksize, scaled, &moltype); + let queryfile_path: camino::Utf8PathBuf = query_filename.into(); + let againstfile_path: camino::Utf8PathBuf = siglist_path.into(); + let selection = build_selection(ksize, scaled, &moltype); + match fastgather::fastgather( - query_filename, - siglist_path, + queryfile_path, + againstfile_path, threshold_bp, ksize, scaled, - template, + &selection, output_path_prefetch, output_path_gather, ) { @@ -103,26 +115,17 @@ fn do_fastmultigather( moltype: String, output_path: Option, ) -> anyhow::Result { + + let queryfile_path: camino::Utf8PathBuf = query_filenames.into(); + let againstfile_path: 
camino::Utf8PathBuf = siglist_path.into(); + let selection = build_selection(ksize, scaled, &moltype); + // if a siglist path is a revindex, run mastiff_manygather. If not, run multigather - let template = build_template(ksize, scaled, &moltype); - if is_revindex_database(siglist_path.as_ref()) { - // build selection instead of template - let hash_function = match moltype.as_str() { - "dna" => HashFunctions::Murmur64Dna, - "protein" => HashFunctions::Murmur64Protein, - "dayhoff" => HashFunctions::Murmur64Dayhoff, - "hp" => HashFunctions::Murmur64Hp, - _ => panic!("Unknown molecule type: {}", moltype), - }; - let selection = Selection::builder() - .ksize(ksize.into()) - .scaled(scaled as u32) - .moltype(hash_function) - .build(); + if is_revindex_database(&againstfile_path) { match mastiff_manygather::mastiff_manygather( - query_filenames, - siglist_path, - selection, + queryfile_path, + againstfile_path, + &selection, threshold_bp, output_path, ) { @@ -134,11 +137,11 @@ fn do_fastmultigather( } } else { match fastmultigather::fastmultigather( - query_filenames, - siglist_path, + queryfile_path, + againstfile_path, threshold_bp, scaled, - template, + &selection, ) { Ok(_) => Ok(0), Err(e) => { @@ -176,21 +179,7 @@ fn do_index( save_paths: bool, colors: bool, ) -> anyhow::Result { - let hash_function = match moltype.as_str() { - "dna" => HashFunctions::Murmur64Dna, - "protein" => HashFunctions::Murmur64Protein, - "dayhoff" => HashFunctions::Murmur64Dayhoff, - "hp" => HashFunctions::Murmur64Hp, - _ => panic!("Unknown molecule type: {}", moltype), - }; - let selection = Selection::builder() - .ksize(ksize.into()) - .scaled(scaled as u32) - .moltype(hash_function) - .build(); - // match index::index(siglist, template, output, save_paths, colors) { - // convert siglist to PathBuf - // build template from ksize, scaled + let selection = build_selection(ksize, scaled, &moltype); let location = camino::Utf8PathBuf::from(siglist); let manifest = None; match 
index::index(location, manifest, selection, output, save_paths, colors) { @@ -204,7 +193,8 @@ fn do_index( #[pyfunction] fn do_check(index: String, quick: bool) -> anyhow::Result { - match check::check(index, quick) { + let idx: camino::Utf8PathBuf = index.into(); + match check::check(idx, quick) { Ok(_) => Ok(0), Err(e) => { eprintln!("Error: {e}"); @@ -223,6 +213,7 @@ fn do_multisearch( moltype: String, output_path: Option, ) -> anyhow::Result { + // let selection = build_selection(ksize, scaled, &moltype); let template = build_template(ksize, scaled, &moltype); match multisearch::multisearch( querylist_path, diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index e50eefc3..19da5728 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -19,26 +19,27 @@ use std::sync::atomic::AtomicUsize; use std::fs::File; use std::io::{BufWriter, Write}; -use crate::utils::{is_revindex_database, load_collection}; //, ReportType}; +use crate::utils::{is_revindex_database, load_collection, ReportType}; + pub fn mastiff_manygather>( - queries_file: String, - index: P, - selection: Selection, + queries_file: camino::Utf8PathBuf, + index: camino::Utf8PathBuf, + selection: &Selection, threshold_bp: usize, output: Option

, ) -> Result<(), Box> { - if !is_revindex_database(index.as_ref()) { + if !is_revindex_database(&index) { bail!( "'{}' is not a valid RevIndex database", - index.as_ref().display() + index ); } // Open database once - let db = RevIndex::open(index.as_ref(), true)?; + let db = RevIndex::open(index, true)?; println!("Loaded DB"); - let query_collection = load_collection(camino::Utf8PathBuf::from(queries_file), &selection)?; + let query_collection = load_collection(&queries_file, selection, ReportType::Query)?; // set up a multi-producer, single-consumer channel. let (send, recv) = std::sync::mpsc::sync_channel(rayon::current_num_threads()); @@ -81,6 +82,7 @@ pub fn mastiff_manygather>( let threshold = threshold_bp / selection.scaled()? as usize; match query_collection.sig_for_dataset(idx) { + // match query_collection.sig_from_record(record) { // to be added in core Ok(query_sig) => { let mut results = vec![]; let mut found_compatible_sketch = false; @@ -117,12 +119,7 @@ pub fn mastiff_manygather>( } } if !found_compatible_sketch { - // if !queryfile_name.ends_with(".zip") { - // eprintln!( - // "WARNING: no compatible sketches in path '{}'", - // filename.display() - // ); - // } + eprintln!("WARNING: no compatible sketches in path '{}'", query_sig.filename()); let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } diff --git a/src/mastiff_manysearch.rs b/src/mastiff_manysearch.rs index 40065d62..9d4a45b0 100644 --- a/src/mastiff_manysearch.rs +++ b/src/mastiff_manysearch.rs @@ -4,39 +4,42 @@ use rayon::prelude::*; use sourmash::index::revindex::{RevIndex, RevIndexOps}; use sourmash::signature::{Signature, SigsTrait}; use sourmash::sketch::Sketch; +use sourmash::selection::Selection; use std::path::Path; use std::sync::atomic; use std::sync::atomic::AtomicUsize; use crate::utils::{ - csvwriter_thread, is_revindex_database, load_sigpaths_from_zip_or_pathlist, prepare_query, + csvwriter_thread, is_revindex_database, load_collection, prepare_query, 
ReportType, SearchResult, }; pub fn mastiff_manysearch>( - queries_file: P, - index: P, - template: Sketch, + queries_path: camino::Utf8PathBuf, + index: camino::Utf8PathBuf, + selection: &Selection, minimum_containment: f64, output: Option

, ) -> Result<(), Box> { - if !is_revindex_database(index.as_ref()) { + if !is_revindex_database(&index) { bail!( "'{}' is not a valid RevIndex database", - index.as_ref().display() + index ); } // Open database once - let db = RevIndex::open(index.as_ref(), true)?; + let db = RevIndex::open(index, true)?; println!("Loaded DB"); // Load query paths - let queryfile_name = queries_file.as_ref().to_string_lossy().to_string(); - let (query_paths, _temp_dir) = - load_sigpaths_from_zip_or_pathlist(&queries_file, &template, ReportType::Query)?; + let query_collection = load_collection(&queries_path, selection, ReportType::Query)?; - // if query_paths is empty, exit with error - if query_paths.is_empty() { + // let queryfile_name = queries_file.as_ref().to_string_lossy().to_string(); + // let (query_paths, _temp_dir) = + // load_sigpaths_from_zip_or_pathlist(&queries_file, &template, ReportType::Query)?; + + // if query_paths is empty, exit with error. this should already happen via load_collection, i think? 
+ if query_collection.len() == 0 { bail!("No query signatures loaded, exiting."); } @@ -56,53 +59,49 @@ pub fn mastiff_manysearch>( let skipped_paths = AtomicUsize::new(0); let failed_paths = AtomicUsize::new(0); - let send_result = query_paths + let send_result = query_collection .par_iter() - .filter_map(|filename| { + .filter_map(|(idx, record)| { let i = processed_sigs.fetch_add(1, atomic::Ordering::SeqCst); if i % 1000 == 0 { eprintln!("Processed {} search sigs", i); } let mut results = vec![]; - - // load query signature from path: - match Signature::from_path(filename) { + match query_collection.sig_for_dataset(idx) { Ok(query_sig) => { - let location = filename.display().to_string(); - if let Some(query) = prepare_query(&query_sig, &template, &location) { - let query_size = query.minhash.size() as f64; - // search mastiff db - let counter = db.counter_for_query(&query.minhash); - let matches = - db.matches_from_counter(counter, minimum_containment as usize); + for sketch in query_sig.iter() { + if let Sketch::MinHash(query_mh) = sketch { + // let location = query_sig.filename(); + let query_size = query_mh.size(); + let counter = db.counter_for_query(&query_mh); + let matches = db.matches_from_counter(counter, minimum_containment as usize); // filter the matches for containment - for (path, overlap) in matches { - let containment = overlap as f64 / query_size; - if containment >= minimum_containment { - results.push(SearchResult { - query_name: query.name.clone(), - query_md5: query.md5sum.clone(), - match_name: path.clone(), - containment, - intersect_hashes: overlap, - match_md5: None, - jaccard: None, - max_containment: None, - }); + for (path, overlap) in matches { + let containment = overlap as f64 / query_size as f64; + if containment >= minimum_containment { + results.push(SearchResult { + query_name: query_sig.name(), + query_md5: query_sig.md5sum(), + match_name: path.clone(), + containment, + intersect_hashes: overlap, + match_md5: None, + 
jaccard: None, + max_containment: None, + }); + } } - } - } else { + + } else { // for reading zips, this is likely not a useful warning and // would show up too often (every sig is stored as individual file). - if !queryfile_name.ends_with(".zip") { eprintln!( - "WARNING: no compatible sketches in path '{}'", - filename.display() + "WARNING: no compatible sketches in path '{}'", query_sig.filename() ); - } let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); + } } if results.is_empty() { None @@ -115,7 +114,7 @@ pub fn mastiff_manysearch>( eprintln!("Sketch loading error: {}", err); eprintln!( "WARNING: could not load sketches from path '{}'", - filename.display() + record.internal_location() ); None } diff --git a/src/python/tests/test_multigather.py b/src/python/tests/test_multigather.py index 646e9309..26c85277 100644 --- a/src/python/tests/test_multigather.py +++ b/src/python/tests/test_multigather.py @@ -183,7 +183,8 @@ def test_missing_querylist(runtmp, capfd, indexed, zip_query): captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory ' in captured.err + # assert 'Error: failed to load query' in captured.err + assert 'Error: No such file or directory' in captured.err @pytest.mark.parametrize('indexed', [False, True]) @@ -239,7 +240,10 @@ def test_bad_query_2(runtmp, capfd, indexed): captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err + if not indexed: + assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err + else: + assert "InvalidArchive" in captured.err @pytest.mark.parametrize('indexed', [False, True]) diff --git a/src/utils.rs b/src/utils.rs index 64472ae2..4efe1fd9 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -17,8 +17,7 @@ use std::sync::atomic::AtomicUsize; use std::collections::BinaryHeap; -use anyhow::{anyhow, Result}; - +use anyhow::{anyhow, Result, Context}; use 
std::cmp::{Ordering, PartialOrd}; use sourmash::signature::{Signature, SigsTrait}; @@ -26,6 +25,8 @@ use sourmash::sketch::minhash::{max_hash_for_scaled, KmerMinHash}; use sourmash::sketch::Sketch; use sourmash::collection::Collection; use sourmash::selection::Selection; +use sourmash::errors::SourmashError; +use sourmash::storage::SigStore; /// Track a name/minhash. @@ -172,9 +173,10 @@ pub fn prefetch( } /// Write list of prefetch matches. -pub fn write_prefetch + std::fmt::Debug + std::fmt::Display + Clone>( - query: &SmallSignature, - prefetch_output: Option

, +// pub fn write_prefetch + std::fmt::Debug + std::fmt::Display + Clone>( +pub fn write_prefetch( + query: &SigStore, + prefetch_output: Option, matchlist: &BinaryHeap, ) -> Result<()> { // Set up a writer for prefetch output @@ -193,7 +195,7 @@ pub fn write_prefetch + std::fmt::Debug + std::fmt::Display + Clo writeln!( &mut writer, "{},\"{}\",{},\"{}\",{},{}", - query.location, query.name, query.md5sum, m.name, m.md5sum, m.overlap + query.filename(), query.name(), query.md5sum(), m.name, m.md5sum, m.overlap ) .ok(); } @@ -464,55 +466,49 @@ pub fn load_sketches( /// those with a minimum overlap. pub fn load_sketches_above_threshold( - sketchlist_paths: Vec, - template: &Sketch, + against_collection: Collection, + selection: &Selection, query: &KmerMinHash, threshold_hashes: u64, ) -> Result<(BinaryHeap, usize, usize)> { let skipped_paths = AtomicUsize::new(0); let failed_paths = AtomicUsize::new(0); - let matchlist: BinaryHeap = sketchlist_paths - .par_iter() - .filter_map(|m| { - let sigs = Signature::from_path(m); - let location = m.display().to_string(); - - match sigs { - Ok(sigs) => { - let mut mm = None; - - if let Some(sm) = prepare_query(&sigs, template, &location) { - let mh = sm.minhash; - if let Ok(overlap) = mh.count_common(query, false) { - if overlap >= threshold_hashes { - let result = PrefetchResult { - name: sm.name, - md5sum: sm.md5sum, - minhash: mh, - overlap, - }; - mm = Some(result); - } + let matchlist: BinaryHeap = against_collection + .par_iter() + .filter_map(|(idx, against_record)| { + let mut mm = None; + // Load against into memory + if let Ok(against_sig) = against_collection.sig_for_dataset(idx) { + if let Some(sketch) = against_sig.sketches().get(0) { + if let Sketch::MinHash(against_mh) = sketch { + if let Ok(overlap) = against_mh.count_common(query, false) { + if overlap >= threshold_hashes { + let result = PrefetchResult { + name: against_sig.name().to_string(), + md5sum: against_mh.md5sum().to_string(), + minhash: 
against_mh.clone(), + overlap, + }; + mm = Some(result); } - } else { - eprintln!("WARNING: no compatible sketches in path '{}'", m.display()); - let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } - mm - } - Err(err) => { - eprintln!("Sketch loading error: {}", err); - let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); - eprintln!( - "WARNING: could not load sketches from path '{}'", - m.display() - ); - None + } else { + eprintln!("WARNING: no compatible sketches in path '{}'", against_sig.filename()); + let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } + } else { + eprintln!("WARNING: no compatible sketches in path '{}'", against_sig.filename()); + let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } - }) - .collect(); + } else { + // this shouldn't happen here anymore -- likely would happen at load_collection + eprintln!("WARNING: could not load sketches for record '{}'", against_record.internal_location()); + let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); + } + mm + }) + .collect(); let skipped_paths = skipped_paths.load(atomic::Ordering::SeqCst); let failed_paths = failed_paths.load(atomic::Ordering::SeqCst); @@ -689,22 +685,100 @@ pub fn load_sketches_from_zip_or_pathlist>( } pub fn load_collection( - sigpath: camino::Utf8PathBuf, + sigpath: &camino::Utf8PathBuf, selection: &Selection, + report_type: ReportType, ) -> Result { + if !sigpath.exists() { + bail!("No such file or directory: '{}'", sigpath); + } let collection = if sigpath.extension().map_or(false, |ext| ext == "zip") { - Collection::from_zipfile(&sigpath)? + match Collection::from_zipfile(&sigpath) { + Ok(collection) => collection, + Err(_) => { + bail!("failed to load {} zipfile: '{}'", report_type, sigpath); + } + } } else { - let sig_paths: Vec<_> = load_sketchlist_filenames_camino(&sigpath) - .unwrap_or_else(|_| panic!("Error loading siglist")) - .into_iter() - .collect(); - Collection::from_paths(&sig_paths)? 
+ let sig_paths = load_sketchlist_filenames_camino(&sigpath)?; + match Collection::from_paths(&sig_paths) { + Ok(collection) => collection, + Err(_) => { + bail!("failed to load {} signature paths: '{}'", report_type, sigpath); + } + } }; - // return collection records that match selection - Ok(collection.select(&selection)?) + + let n_total = collection.len(); + let selected = collection.select(&selection)?; + let n_skipped = n_total - selected.len(); + let n_failed = 0; // TODO: can we get list / number of failed paths from core??? + report_on_collection_loading(&selected, n_skipped, n_failed, report_type)?; + Ok(selected) +} + +pub fn report_on_collection_loading( + collection: &Collection, + skipped_paths: usize, + failed_paths: usize, + report_type: ReportType, +) -> Result<()> { + if failed_paths > 0 { + eprintln!( + "WARNING: {} {} paths failed to load. See error messages above.", + failed_paths, report_type + ); + } + if skipped_paths > 0 { + eprintln!( + "WARNING: skipped {} {} paths - no compatible signatures.", + skipped_paths, report_type + ); + } + + // Validate sketches + if collection.is_empty() { + bail!("No {} signatures loaded, exiting.", report_type); + } + eprintln!("Loaded {} {} signature(s)", collection.len(), report_type); + Ok(()) } +pub fn load_single_sig_from_collection( + query_collection: &Collection, // Replace with the actual type + selection: &Selection, +) -> Result { + let scaled = selection.scaled().unwrap(); + let ksize = selection.ksize().unwrap(); + + match query_collection.sig_for_dataset(0) { + Ok(sig) => Ok(sig), + Err(_) => Err(anyhow::anyhow!("No sketch found with scaled={}, k={}", scaled, ksize)), + } +} + +// pub fn load_single_sketch_from_sig<'a>(sig: &'a SigStore, selection: &'a Selection) -> Result<&'a KmerMinHash> { +// let sketch = sig.sketches().get(0).ok_or_else(|| { +// anyhow::anyhow!("No sketch found with scaled={}, k={}", selection.scaled().unwrap_or_default(), selection.ksize().unwrap_or_default()) +// })?; + 
+// if let Sketch::MinHash(mh) = sketch { +// Ok(mh) +// } else { +// Err(anyhow::anyhow!("No sketch found with scaled={}, k={}", selection.scaled().unwrap_or_default(), selection.ksize().unwrap_or_default())) +// } +// } + +// pub fn load_single_sig_and_sketch<'a>( +// query_collection: &'a Collection, +// selection: &'a Selection, +// ) -> Result<(SigStore, &'a KmerMinHash)> { +// let sig = load_single_sig_from_collection(query_collection, selection)?; +// let sketch = load_single_sketch_from_sig(&sig, selection)?; +// Ok((sig, sketch)) +// } + + /// Uses the output of sketch loading functions to report the /// total number of sketches loaded, as well as the number of files, /// if any, that failed to load or contained no compatible sketches. @@ -758,7 +832,7 @@ pub fn report_on_sketch_loading( /// removing matches in 'matchlist' from 'query'. pub fn consume_query_by_gather + std::fmt::Debug + std::fmt::Display + Clone>( - query: SmallSignature, + query: SigStore, matchlist: BinaryHeap, threshold_hashes: u64, gather_output: Option

, @@ -778,17 +852,25 @@ pub fn consume_query_by_gather + std::fmt::Debug + std::fmt::Disp let mut matching_sketches = matchlist; let mut rank = 0; - let mut last_hashes = query.minhash.size(); + let mut last_hashes = query.size(); let mut last_matches = matching_sketches.len(); - let location = query.location; - let mut query_mh = query.minhash; + // let location = query.location; + let location = query.filename(); + // let mut query_mh = query.minhash; + + let sketches = query.sketches(); + let orig_query_mh = match sketches.get(0) { + Some(Sketch::MinHash(mh)) => Ok(mh), + _ => Err(anyhow::anyhow!("No MinHash found")), + }?; + let mut query_mh = orig_query_mh.clone(); eprintln!( "{} iter {}: start: query hashes={} matches={}", location, rank, - query_mh.size(), + query.size(), matching_sketches.len() ); @@ -803,8 +885,8 @@ pub fn consume_query_by_gather + std::fmt::Debug + std::fmt::Disp "{},{},\"{}\",{},\"{}\",{},{}", location, rank, - query.name, - query.md5sum, + query.name(), + query.md5sum(), best_element.name, best_element.md5sum, best_element.overlap @@ -855,7 +937,23 @@ pub fn build_template(ksize: u8, scaled: usize, moltype: &str) -> Sketch { Sketch::MinHash(template_mh) } -pub fn is_revindex_database(path: &Path) -> bool { +pub fn build_selection(ksize: u8, scaled: usize, moltype: &str) -> Selection { + let hash_function = match moltype { + "dna" => HashFunctions::Murmur64Dna, + "protein" => HashFunctions::Murmur64Protein, + "dayhoff" => HashFunctions::Murmur64Dayhoff, + "hp" => HashFunctions::Murmur64Hp, + _ => panic!("Unknown molecule type: {}", moltype), + }; + + Selection::builder() + .ksize(ksize.into()) + .scaled(scaled as u32) + .moltype(hash_function) + .build() +} + +pub fn is_revindex_database(path: &camino::Utf8PathBuf) -> bool { // quick file check for Revindex database: // is path a directory that contains a file named 'CURRENT'? if path.is_dir() { From 32fc2d5a5dcda69ea60229e70b208f4602e2004c Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Fri, 26 Jan 2024 12:18:36 -0800 Subject: [PATCH 19/47] fix fastgather --- src/fastgather.rs | 187 +++++++++++++++------------------------------- 1 file changed, 60 insertions(+), 127 deletions(-) diff --git a/src/fastgather.rs b/src/fastgather.rs index 2fef7522..2ff13509 100644 --- a/src/fastgather.rs +++ b/src/fastgather.rs @@ -28,6 +28,30 @@ pub fn fastgather( if query_collection.len() > 1 { bail!("Found more than one compatible sketch from '{}'. Fastgather requires a single query sketch.", &query_filepath) } + // load query sig into memory + let mut query_mh = None; + let mut query_sig = None; + for (idx, _record) in query_collection.iter() { + // Load query sig + match query_collection.sig_for_dataset(idx) { + Ok(query_sig) => { + for sketch in query_sig.iter() { + // Access query MinHash + if let Sketch::MinHash(query) = sketch { + query_mh = Some(query.clone()); + break; + } + } + } + Err(_) => { + bail!("No query sketch matching selection parameters.") // should not get here bc we already check this during collection loading? + } + } + + if query_mh.is_some() { + break; // Exit the loop if we found a MinHash sketch + } + } // build the list of paths to match against. 
eprintln!("Loading matchlist from '{}'", against_filepath); @@ -49,131 +73,40 @@ pub fn fastgather( "using threshold overlap: {} {}", threshold_hashes, threshold_bp ); - query_collection.iter().for_each(|(idx, record)| { - // Load query sig - match query_collection.sig_for_dataset(idx) { - Ok(query_sig) => { - let location = query_sig.filename(); - let mut matchlist: BinaryHeap = BinaryHeap::new(); - let mut skipped_paths = 0; - let mut failed_paths = 0; - - for sketch in query_sig.iter() { - // Access query MinHash - if let Sketch::MinHash(query) = sketch { - let result = load_sketches_above_threshold( - against_collection, - &selection, - &query, - threshold_hashes, - ); - - match result { - Ok((loaded_matchlist, skipped, failed)) => { - matchlist.extend(loaded_matchlist); - skipped_paths += skipped; - failed_paths += failed; - } - Err(err) => { - eprintln!("Error loading sketches: {:?}", err); - failed_paths += 1; - } - } - } - } - - if skipped_paths > 0 { - eprintln!( - "WARNING: Skipped {} search paths - no compatible signatures.", - skipped_paths - ); - } - if failed_paths > 0 { - eprintln!( - "WARNING: {} search paths failed to load. See error messages above.", - failed_paths - ); - } - - if matchlist.is_empty() { - eprintln!("No search signatures loaded for '{}', exiting.", location); - return; // Return early if no search signatures loaded - } - - if let Some(prefetch_output) = &prefetch_output { - write_prefetch(&query_sig, Some(prefetch_output.clone()), &matchlist).ok(); - } - - // Run the gather! 
- if let Some(gather_output) = &gather_output { - if let Err(err) = consume_query_by_gather(query_sig, matchlist, threshold_hashes, Some(gather_output)) { - eprintln!("Error during gather: {:?}", err); - } - } - } - Err(_) => { - eprintln!("WARNING: Could not load query sketch '{}'", record.internal_location()); - } - } - }); - Ok(()) -} -// query_collection.iter().for_each(|(idx, record)| { -// // Load query sig -// match query_collection.sig_for_dataset(idx) { -// Ok(query_sig) => { -// let location = query_sig.filename(); -// for sketch in query_sig.iter() { -// // Access query MinHash -// if let Sketch::MinHash(query) = sketch { -// let matchlist: BinaryHeap = sketchlist -// .par_iter() -// .filter_map(|sm| { -// // Call a function to load sketches above threshold -// let result = load_sketches_above_threshold( -// against_collection, -// &selection, -// &query, -// threshold_hashes, -// )?; -// let matchlist = result.0; -// let skipped_paths = result.1; -// let failed_paths = result.2; - -// if skipped_paths > 0 { -// eprintln!( -// "WARNING: skipped {} search paths - no compatible signatures.", -// skipped_paths -// ); -// } -// if failed_paths > 0 { -// eprintln!( -// "WARNING: {} search paths failed to load. See error messages above.", -// failed_paths -// ); -// } - -// if matchlist.is_empty() { -// eprintln!("No search signatures loaded, exiting."); -// return Ok(()); -// } - -// if prefetch_output.is_some() { -// write_prefetch(&query_sig, prefetch_output, &matchlist).ok(); -// } - -// // run the gather! 
-// consume_query_by_gather(query_sig, matchlist, threshold_hashes, gather_output).ok(); -// }); -// } -// } -// } -// } -// Err(_) => { -// eprintln!("WARNING: Could not load query sketch '{}'", record.internal_location()); -// } -// } -// }); -// Ok(()) -// } + // load a set of sketches, filtering for those with overlaps > threshold + let result = load_sketches_above_threshold( + against_collection, + &selection, + &query_mh.unwrap(), + threshold_hashes, + )?; + let matchlist = result.0; + let skipped_paths = result.1; + let failed_paths = result.2; + if skipped_paths > 0 { + eprintln!( + "WARNING: skipped {} search paths - no compatible signatures.", + skipped_paths + ); + } + if failed_paths > 0 { + eprintln!( + "WARNING: {} search paths failed to load. See error messages above.", + failed_paths + ); + } + + if matchlist.is_empty() { + eprintln!("No search signatures loaded, exiting."); + return Ok(()); + } + + if prefetch_output.is_some() { + write_prefetch(query_sig.as_ref().unwrap(), prefetch_output, &matchlist).ok(); + } + + // run the gather! + consume_query_by_gather(query_sig.clone().unwrap(), matchlist, threshold_hashes, gather_output).ok(); + Ok(()) +} \ No newline at end of file From 39fb7dceb8454dcccb90e4848bed1e8e7429e3de Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Fri, 26 Jan 2024 14:04:57 -0800 Subject: [PATCH 20/47] re-enable more permissive pathlist loading --- src/python/tests/test_search.py | 4 +- src/utils.rs | 169 +++++++++++++++++++------------- 2 files changed, 101 insertions(+), 72 deletions(-) diff --git a/src/python/tests/test_search.py b/src/python/tests/test_search.py index 5427d303..6f29ec9b 100644 --- a/src/python/tests/test_search.py +++ b/src/python/tests/test_search.py @@ -247,7 +247,7 @@ def test_missing_query(runtmp, capfd, indexed, zip_query): captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory ' in captured.err + assert 'Error: No such file or directory' in captured.err @pytest.mark.parametrize("indexed", [False, True]) @@ -273,7 +273,7 @@ def test_bad_query(runtmp, capfd, indexed): captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid line in fromfile ' in captured.err + assert 'Error: invalid line in fromfile' in captured.err @pytest.mark.parametrize("indexed", [False, True]) diff --git a/src/utils.rs b/src/utils.rs index 4efe1fd9..7824113f 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -7,6 +7,7 @@ use sourmash::selection::Select; use std::fs::File; use std::io::Read; use std::io::{BufRead, BufReader, BufWriter, Write}; +use std::panic; use std::path::{Path, PathBuf}; use tempfile::tempdir; @@ -17,17 +18,17 @@ use std::sync::atomic::AtomicUsize; use std::collections::BinaryHeap; -use anyhow::{anyhow, Result, Context}; +use anyhow::{anyhow, Context, Result}; use std::cmp::{Ordering, PartialOrd}; +use sourmash::collection::{self, Collection}; +use sourmash::errors::SourmashError; +use sourmash::manifest::Record; +use sourmash::selection::Selection; use sourmash::signature::{Signature, SigsTrait}; use sourmash::sketch::minhash::{max_hash_for_scaled, KmerMinHash}; use sourmash::sketch::Sketch; -use sourmash::collection::Collection; -use sourmash::selection::Selection; -use sourmash::errors::SourmashError; -use 
sourmash::storage::SigStore; - +use sourmash::storage::{FSStorage, InnerStorage, SigStore}; /// Track a name/minhash. @@ -195,7 +196,12 @@ pub fn write_prefetch( writeln!( &mut writer, "{},\"{}\",{},\"{}\",{},{}", - query.filename(), query.name(), query.md5sum(), m.name, m.md5sum, m.overlap + query.filename(), + query.name(), + query.md5sum(), + m.name, + m.md5sum, + m.overlap ) .ok(); } @@ -229,31 +235,6 @@ pub fn load_sketchlist_filenames>(sketchlist_filename: &P) -> Res Ok(sketchlist_filenames) } -pub fn load_sketchlist_filenames_camino>(sketchlist_filename: &P) -> Result> { - let sketchlist_file = BufReader::new(File::open(sketchlist_filename)?); - - let mut sketchlist_filenames: Vec = Vec::new(); - for line in sketchlist_file.lines() { - let line = match line { - Ok(v) => v, - Err(_) => { - return { - let filename = sketchlist_filename.as_ref().display(); - let msg = format!("invalid line in fromfile '{}'", filename); - Err(anyhow!(msg)) - } - } - }; - - if !line.is_empty() { - let path = camino::Utf8PathBuf::from(line); - sketchlist_filenames.push(path); - } - } - Ok(sketchlist_filenames) -} - - /// Loads signature file paths from a ZIP archive. 
/// /// This function extracts the contents of a ZIP archive containing @@ -475,40 +456,49 @@ pub fn load_sketches_above_threshold( let failed_paths = AtomicUsize::new(0); let matchlist: BinaryHeap = against_collection - .par_iter() - .filter_map(|(idx, against_record)| { - let mut mm = None; - // Load against into memory - if let Ok(against_sig) = against_collection.sig_for_dataset(idx) { - if let Some(sketch) = against_sig.sketches().get(0) { - if let Sketch::MinHash(against_mh) = sketch { - if let Ok(overlap) = against_mh.count_common(query, false) { - if overlap >= threshold_hashes { - let result = PrefetchResult { - name: against_sig.name().to_string(), - md5sum: against_mh.md5sum().to_string(), - minhash: against_mh.clone(), - overlap, - }; - mm = Some(result); + .par_iter() + .filter_map(|(idx, against_record)| { + let mut mm = None; + // Load against into memory + if let Ok(against_sig) = against_collection.sig_for_dataset(idx) { + if let Some(sketch) = against_sig.sketches().get(0) { + if let Sketch::MinHash(against_mh) = sketch { + if let Ok(overlap) = against_mh.count_common(query, false) { + if overlap >= threshold_hashes { + let result = PrefetchResult { + name: against_sig.name().to_string(), + md5sum: against_mh.md5sum().to_string(), + minhash: against_mh.clone(), + overlap, + }; + mm = Some(result); + } } + } else { + eprintln!( + "WARNING: no compatible sketches in path '{}'", + against_sig.filename() + ); + let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } } else { - eprintln!("WARNING: no compatible sketches in path '{}'", against_sig.filename()); + eprintln!( + "WARNING: no compatible sketches in path '{}'", + against_sig.filename() + ); let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } } else { - eprintln!("WARNING: no compatible sketches in path '{}'", against_sig.filename()); + // this shouldn't happen here anymore -- likely would happen at load_collection + eprintln!( + "WARNING: could not load sketches for 
record '{}'", + against_record.internal_location() + ); let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } - } else { - // this shouldn't happen here anymore -- likely would happen at load_collection - eprintln!("WARNING: could not load sketches for record '{}'", against_record.internal_location()); - let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); - } - mm - }) - .collect(); + mm + }) + .collect(); let skipped_paths = skipped_paths.load(atomic::Ordering::SeqCst); let failed_paths = failed_paths.load(atomic::Ordering::SeqCst); @@ -672,7 +662,6 @@ pub fn load_sketches_from_zip_or_pathlist>( .map(|ext| ext == "zip") .unwrap_or(false) { - load_sketches_from_zip(sketchlist_path, template)? } else { let sketch_paths = load_sketchlist_filenames(&sketchlist_path)?; @@ -692,6 +681,8 @@ pub fn load_collection( if !sigpath.exists() { bail!("No such file or directory: '{}'", sigpath); } + + let mut n_failed = 0; let collection = if sigpath.extension().map_or(false, |ext| ext == "zip") { match Collection::from_zipfile(&sigpath) { Ok(collection) => collection, @@ -700,19 +691,54 @@ pub fn load_collection( } } } else { - let sig_paths = load_sketchlist_filenames_camino(&sigpath)?; - match Collection::from_paths(&sig_paths) { - Ok(collection) => collection, - Err(_) => { - bail!("failed to load {} signature paths: '{}'", report_type, sigpath); - } - } + let sketchlist_file = BufReader::new(File::open(sigpath)?); + + let records: Vec = sketchlist_file + .lines() + .filter_map(|line| { + let path = match line { + Ok(path) => path, + Err(err) => { + eprintln!("Error: invalid line in fromfile"); + return None; // Skip + } + }; + + match Signature::from_path(&path) { + Ok(signatures) => { + let recs: Vec = signatures + .into_iter() + .flat_map(|v| Record::from_sig(&v, path.as_str())) + .collect(); + Some(recs) + } + Err(err) => { + eprintln!("Sketch loading error: {}", err); + eprintln!("WARNING: could not load sketches from path '{}'", path); + n_failed 
+= 1; + None + } + } + }) + .flatten() + .collect(); + + let manifest: Manifest = records.into(); + + Collection::new( + manifest, + InnerStorage::new( + FSStorage::builder() + .fullpath("".into()) + .subdir("".into()) + .build(), + ), + ) }; let n_total = collection.len(); let selected = collection.select(&selection)?; let n_skipped = n_total - selected.len(); - let n_failed = 0; // TODO: can we get list / number of failed paths from core??? report_on_collection_loading(&selected, n_skipped, n_failed, report_type)?; Ok(selected) } @@ -753,7 +779,11 @@ pub fn load_single_sig_from_collection( match query_collection.sig_for_dataset(0) { Ok(sig) => Ok(sig), - Err(_) => Err(anyhow::anyhow!("No sketch found with scaled={}, k={}", scaled, ksize)), + Err(_) => Err(anyhow::anyhow!( + "No sketch found with scaled={}, k={}", + scaled, + ksize + )), } } @@ -778,7 +808,6 @@ pub fn load_single_sig_from_collection( // Ok((sig, sketch)) // } - /// Uses the output of sketch loading functions to report the /// total number of sketches loaded, as well as the number of files, /// if any, that failed to load or contained no compatible sketches. @@ -945,7 +974,7 @@ pub fn build_selection(ksize: u8, scaled: usize, moltype: &str) -> Selection { "hp" => HashFunctions::Murmur64Hp, _ => panic!("Unknown molecule type: {}", moltype), }; - + Selection::builder() .ksize(ksize.into()) .scaled(scaled as u32) From 15f7dba7a54828064fef8c1e8a00ee28b83561a1 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Fri, 26 Jan 2024 14:07:51 -0800 Subject: [PATCH 21/47] clean up ms --- src/mastiff_manysearch.rs | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/src/mastiff_manysearch.rs b/src/mastiff_manysearch.rs index 9d4a45b0..24fff34e 100644 --- a/src/mastiff_manysearch.rs +++ b/src/mastiff_manysearch.rs @@ -2,16 +2,15 @@ use anyhow::Result; use rayon::prelude::*; use sourmash::index::revindex::{RevIndex, RevIndexOps}; -use sourmash::signature::{Signature, SigsTrait}; -use sourmash::sketch::Sketch; use sourmash::selection::Selection; +use sourmash::signature::SigsTrait; +use sourmash::sketch::Sketch; use std::path::Path; use std::sync::atomic; use std::sync::atomic::AtomicUsize; use crate::utils::{ - csvwriter_thread, is_revindex_database, load_collection, prepare_query, - ReportType, SearchResult, + csvwriter_thread, is_revindex_database, load_collection, ReportType, SearchResult, }; pub fn mastiff_manysearch>( @@ -22,10 +21,7 @@ pub fn mastiff_manysearch>( output: Option

, ) -> Result<(), Box> { if !is_revindex_database(&index) { - bail!( - "'{}' is not a valid RevIndex database", - index - ); + bail!("'{}' is not a valid RevIndex database", index); } // Open database once let db = RevIndex::open(index, true)?; @@ -34,10 +30,6 @@ pub fn mastiff_manysearch>( // Load query paths let query_collection = load_collection(&queries_path, selection, ReportType::Query)?; - // let queryfile_name = queries_file.as_ref().to_string_lossy().to_string(); - // let (query_paths, _temp_dir) = - // load_sigpaths_from_zip_or_pathlist(&queries_file, &template, ReportType::Query)?; - // if query_paths is empty, exit with error. this should already happen via load_collection, i think? if query_collection.len() == 0 { bail!("No query signatures loaded, exiting."); @@ -75,9 +67,10 @@ pub fn mastiff_manysearch>( // let location = query_sig.filename(); let query_size = query_mh.size(); let counter = db.counter_for_query(&query_mh); - let matches = db.matches_from_counter(counter, minimum_containment as usize); + let matches = + db.matches_from_counter(counter, minimum_containment as usize); - // filter the matches for containment + // filter the matches for containment for (path, overlap) in matches { let containment = overlap as f64 / query_size as f64; if containment >= minimum_containment { @@ -93,14 +86,12 @@ pub fn mastiff_manysearch>( }); } } - } else { - // for reading zips, this is likely not a useful warning and - // would show up too often (every sig is stored as individual file). eprintln!( - "WARNING: no compatible sketches in path '{}'", query_sig.filename() + "WARNING: no compatible sketches in path '{}'", + query_sig.filename() ); - let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); + let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } } if results.is_empty() { @@ -159,7 +150,5 @@ pub fn mastiff_manysearch>( ); } - // _temp_dir goes out of scope => is deleted. 
- Ok(()) } From 4e3b7ee297f6fe97813ed862a80e9246b192654c Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Fri, 26 Jan 2024 14:17:52 -0800 Subject: [PATCH 22/47] harmonize errors --- src/python/tests/test_multigather.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/python/tests/test_multigather.py b/src/python/tests/test_multigather.py index 26c85277..3f59278c 100644 --- a/src/python/tests/test_multigather.py +++ b/src/python/tests/test_multigather.py @@ -208,7 +208,7 @@ def test_bad_query(runtmp, capfd, indexed): captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid line in fromfile ' in captured.err + assert 'Error: invalid line in fromfile' in captured.err @pytest.mark.parametrize('indexed', [False, True]) @@ -240,10 +240,7 @@ def test_bad_query_2(runtmp, capfd, indexed): captured = capfd.readouterr() print(captured.err) - if not indexed: - assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err - else: - assert "InvalidArchive" in captured.err + assert "InvalidArchive" in captured.err @pytest.mark.parametrize('indexed', [False, True]) From 14ee1bdbce51f2c52843995ec79cd324c5622251 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Fri, 26 Jan 2024 16:16:02 -0800 Subject: [PATCH 23/47] harmonize error text and output filenames --- src/fastmultigather.rs | 115 ++++++++++++++++----------- src/python/tests/test_multigather.py | 55 ++++++------- src/utils.rs | 62 ++++++++++----- 3 files changed, 130 insertions(+), 102 deletions(-) diff --git a/src/fastmultigather.rs b/src/fastmultigather.rs index 70850a37..71a0e174 100644 --- a/src/fastmultigather.rs +++ b/src/fastmultigather.rs @@ -2,20 +2,22 @@ use anyhow::Result; use rayon::prelude::*; +use serde::Serialize; +use sourmash::selection::Selection; +use sourmash::sketch::Sketch; use sourmash::storage::SigStore; use sourmash::{selection, signature::Signature}; -use sourmash::sketch::Sketch; -use sourmash::selection::Selection; use std::sync::atomic; use std::sync::atomic::AtomicUsize; use std::collections::BinaryHeap; -use camino::Utf8PathBuf; +use camino::{Utf8Path, Utf8PathBuf}; use crate::utils::{ - consume_query_by_gather, load_collection, load_sigpaths_from_zip_or_pathlist, load_sketches_from_zip_or_pathlist, prepare_query, write_prefetch, PrefetchResult, ReportType + consume_query_by_gather, load_collection, load_sigpaths_from_zip_or_pathlist, + load_sketches_from_zip_or_pathlist, prepare_query, write_prefetch, PrefetchResult, ReportType, }; pub fn fastmultigather( @@ -23,7 +25,6 @@ pub fn fastmultigather( against_filepath: camino::Utf8PathBuf, threshold_bp: usize, scaled: usize, - // template: Sketch, selection: &Selection, ) -> Result<()> { // load the list of query paths @@ -63,58 +64,76 @@ pub fn fastmultigather( query_collection.par_iter().for_each(|(idx, record)| { // increment counter of # of queries. q: could we instead use the index from par_iter()? 
- let _i = processed_queries.fetch_add(1, atomic::Ordering::SeqCst); + let _i = processed_queries.fetch_add(1, atomic::Ordering::SeqCst); // Load query sig - match query_collection.sig_for_dataset(idx) { - Ok(query_sig) => { - let location = query_sig.filename(); - for sketch in query_sig.iter() { - // Access query MinHash - if let Sketch::MinHash(query) = sketch { - let matchlist: BinaryHeap = sketchlist - .par_iter() - .filter_map(|sm| { - let mut mm = None; - // Access against MinHash - if let Some(sketch) = sm.sketches().get(0) { - if let Sketch::MinHash(against_sketch) = sketch { - if let Ok(overlap) = against_sketch.count_common(&query, false) { - if overlap >= threshold_hashes { - let result = PrefetchResult { - name: sm.name(), - md5sum: sm.md5sum().clone(), - minhash: against_sketch.clone(), - overlap, - }; - mm = Some(result); + match query_collection.sig_for_dataset(idx) { + Ok(query_sig) => { + let prefix = query_sig.name(); + let location = Utf8Path::new(&prefix).file_name().unwrap(); + for sketch in query_sig.iter() { + // Access query MinHash + if let Sketch::MinHash(query) = sketch { + let matchlist: BinaryHeap = sketchlist + .par_iter() + .filter_map(|sm| { + let mut mm = None; + // Access against MinHash + if let Some(sketch) = sm.sketches().get(0) { + if let Sketch::MinHash(against_sketch) = sketch { + if let Ok(overlap) = + against_sketch.count_common(&query, true) + { + if overlap >= threshold_hashes { + let result = PrefetchResult { + name: sm.name(), + md5sum: sm.md5sum().clone(), + minhash: against_sketch.clone(), + overlap, + }; + mm = Some(result); + } } } } - } - mm - }) - .collect(); - if !matchlist.is_empty() { - let prefetch_output = format!("{}.prefetch.csv", location); - let gather_output = format!("{}.gather.csv", location); - - // Save initial list of matches to prefetch output - write_prefetch(&query_sig, Some(prefetch_output), &matchlist).ok(); - - // Now, do the gather! 
- consume_query_by_gather(query_sig.clone(), matchlist, threshold_hashes, Some(gather_output)).ok(); + mm + }) + .collect(); + if !matchlist.is_empty() { + let prefetch_output = format!("{}.prefetch.csv", location); + let gather_output = format!("{}.gather.csv", location); + + // Save initial list of matches to prefetch output + write_prefetch(&query_sig, Some(prefetch_output), &matchlist).ok(); + + // Now, do the gather! + consume_query_by_gather( + query_sig.clone(), + matchlist, + threshold_hashes, + Some(gather_output), + ) + .ok(); + } else { + println!("No matches to '{}'", location); + } } else { - println!("No matches to '{}'", location); + eprintln!( + "WARNING: no compatible sketches in path '{}'", + record.internal_location() + ); + let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } } } + Err(_) => { + eprintln!( + "WARNING: no compatible sketches in path '{}'", + record.internal_location() + ); + let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); + } } - Err(_) => { - eprintln!("WARNING: no compatible sketches in path '{}'", record.internal_location()); - let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); - } - } -}); + }); println!( "Processed {} queries total.", diff --git a/src/python/tests/test_multigather.py b/src/python/tests/test_multigather.py index 3f59278c..960bc68d 100644 --- a/src/python/tests/test_multigather.py +++ b/src/python/tests/test_multigather.py @@ -67,8 +67,8 @@ def test_simple(runtmp, zip_against): print(os.listdir(runtmp.output(''))) - g_output = runtmp.output('SRR606249.sig.gz.gather.csv') - p_output = runtmp.output('SRR606249.sig.gz.prefetch.csv') + g_output = runtmp.output('SRR606249.gather.csv') + p_output = runtmp.output('SRR606249.prefetch.csv') assert os.path.exists(p_output) # check prefetch output (only non-indexed gather) @@ -79,6 +79,7 @@ def test_simple(runtmp, zip_against): assert os.path.exists(g_output) df = pandas.read_csv(g_output) + print(df) assert len(df) == 3 keys = 
set(df.keys()) assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'rank', 'intersect_bp'} @@ -109,9 +110,8 @@ def test_simple_zip_query(runtmp): print(os.listdir(runtmp.output(''))) - # outputs are based on md5sum, e.g. "{md5}.sig.gz.gather.csv" - g_output = runtmp.output('dec29ca72e68db0f15de0b1b46f82fc5.sig.gz.gather.csv') - p_output = runtmp.output('dec29ca72e68db0f15de0b1b46f82fc5.sig.gz.prefetch.csv') + g_output = runtmp.output('SRR606249.gather.csv') + p_output = runtmp.output('SRR606249.prefetch.csv') # check prefetch output (only non-indexed gather) assert os.path.exists(p_output) @@ -294,10 +294,7 @@ def test_nomatch_query(runtmp, capfd, indexed, zip_query): captured = capfd.readouterr() print(captured.err) - if zip_query: - assert "WARNING: no compatible sketches in path " not in captured.err - else: - assert "WARNING: no compatible sketches in path " in captured.err + # assert "WARNING: no compatible sketches in path " in captured.err assert "WARNING: skipped 1 query paths - no compatible signatures." 
in captured.err @@ -324,7 +321,7 @@ def test_missing_against(runtmp, capfd, zip_against): captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory ' in captured.err + assert 'Error: No such file or directory' in captured.err def test_bad_against(runtmp, capfd): @@ -341,7 +338,7 @@ def test_bad_against(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid line in fromfile ' in captured.err + assert 'Error: invalid line in fromfile' in captured.err def test_bad_against_2(runtmp, capfd): @@ -390,7 +387,7 @@ def test_bad_against_3(runtmp, capfd, zip_query): captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err + assert 'InvalidArchive' in captured.err def test_empty_against(runtmp, capfd): @@ -409,7 +406,7 @@ def test_empty_against(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert "Loaded 0 search signature(s)" in captured.err + assert "Sketch loading error: No such file or directory" in captured.err assert "Error: No search signatures loaded, exiting." 
in captured.err @@ -465,11 +462,8 @@ def test_md5(runtmp, zip_query): print(os.listdir(runtmp.output(''))) - g_output = runtmp.output('SRR606249.sig.gz.gather.csv') - p_output = runtmp.output('SRR606249.sig.gz.prefetch.csv') - if zip_query: - g_output = runtmp.output('dec29ca72e68db0f15de0b1b46f82fc5.sig.gz.gather.csv') - p_output = runtmp.output('dec29ca72e68db0f15de0b1b46f82fc5.sig.gz.prefetch.csv') + g_output = runtmp.output('SRR606249.gather.csv') + p_output = runtmp.output('SRR606249.prefetch.csv') # check prefetch output (only non-indexed gather) assert os.path.exists(p_output) @@ -560,11 +554,8 @@ def test_csv_columns_vs_sourmash_prefetch(runtmp, zip_query, zip_against): finally: os.chdir(cwd) - g_output = runtmp.output('SRR606249.sig.gz.gather.csv') - p_output = runtmp.output('SRR606249.sig.gz.prefetch.csv') - if zip_query: - g_output = runtmp.output('dec29ca72e68db0f15de0b1b46f82fc5.sig.gz.gather.csv') - p_output = runtmp.output('dec29ca72e68db0f15de0b1b46f82fc5.sig.gz.prefetch.csv') + g_output = runtmp.output('SRR606249.gather.csv') + p_output = runtmp.output('SRR606249.prefetch.csv') assert os.path.exists(p_output) assert os.path.exists(g_output) @@ -627,14 +618,14 @@ def test_simple_protein(runtmp): # test basic protein execution sigs = get_test_data('protein.zip') - sig_names = ["GCA_001593935.1_ASM159393v1_protein.faa.gz", "GCA_001593925.1_ASM159392v1_protein.faa.gz"] + sig_names = ["GCA_001593935", "GCA_001593925"] runtmp.sourmash('scripts', 'fastmultigather', sigs, sigs, '-s', '100', '--moltype', 'protein', '-k', '19') for qsig in sig_names: - g_output = runtmp.output(os.path.join(qsig + '.sig.gather.csv')) - p_output = runtmp.output(os.path.join(qsig + '.sig.prefetch.csv')) + g_output = runtmp.output(os.path.join(qsig + '.gather.csv')) + p_output = runtmp.output(os.path.join(qsig + '.prefetch.csv')) print(g_output) assert os.path.exists(g_output) assert os.path.exists(p_output) @@ -652,14 +643,14 @@ def test_simple_dayhoff(runtmp): # test basic 
protein execution sigs = get_test_data('dayhoff.zip') - sig_names = ["GCA_001593935.1_ASM159393v1_protein.faa.gz", "GCA_001593925.1_ASM159392v1_protein.faa.gz"] + sig_names = ["GCA_001593935", "GCA_001593925"] runtmp.sourmash('scripts', 'fastmultigather', sigs, sigs, '-s', '100', '--moltype', 'dayhoff', '-k', '19') for qsig in sig_names: - g_output = runtmp.output(os.path.join(qsig + '.sig.gather.csv')) - p_output = runtmp.output(os.path.join(qsig + '.sig.prefetch.csv')) + g_output = runtmp.output(os.path.join(qsig + '.gather.csv')) + p_output = runtmp.output(os.path.join(qsig + '.prefetch.csv')) print(g_output) assert os.path.exists(g_output) assert os.path.exists(p_output) @@ -677,14 +668,14 @@ def test_simple_hp(runtmp): # test basic protein execution sigs = get_test_data('hp.zip') - sig_names = ["GCA_001593935.1_ASM159393v1_protein.faa.gz", "GCA_001593925.1_ASM159392v1_protein.faa.gz"] + sig_names = ["GCA_001593935", "GCA_001593925"] runtmp.sourmash('scripts', 'fastmultigather', sigs, sigs, '-s', '100', '--moltype', 'hp', '-k', '19') for qsig in sig_names: - g_output = runtmp.output(os.path.join(qsig + '.sig.gather.csv')) - p_output = runtmp.output(os.path.join(qsig + '.sig.prefetch.csv')) + g_output = runtmp.output(os.path.join(qsig + '.gather.csv')) + p_output = runtmp.output(os.path.join(qsig + '.prefetch.csv')) print(g_output) assert os.path.exists(g_output) assert os.path.exists(p_output) diff --git a/src/utils.rs b/src/utils.rs index 7824113f..a6b07b02 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -4,7 +4,7 @@ use sourmash::encodings::HashFunctions; use sourmash::manifest::Manifest; use sourmash::selection::Select; -use std::fs::File; +use std::fs::{create_dir_all, File}; use std::io::Read; use std::io::{BufRead, BufReader, BufWriter, Write}; use std::panic; @@ -161,7 +161,8 @@ pub fn prefetch( .filter_map(|result| { let mut mm = None; let searchsig = &result.minhash; - let overlap = searchsig.count_common(query_mh, false); + // TODO: fix Select so we 
can go back to downsample: false here + let overlap = searchsig.count_common(query_mh, true); if let Ok(overlap) = overlap { if overlap >= threshold_hashes { let result = PrefetchResult { overlap, ..result }; @@ -174,18 +175,27 @@ pub fn prefetch( } /// Write list of prefetch matches. -// pub fn write_prefetch + std::fmt::Debug + std::fmt::Display + Clone>( pub fn write_prefetch( query: &SigStore, prefetch_output: Option, matchlist: &BinaryHeap, -) -> Result<()> { - // Set up a writer for prefetch output - let prefetch_out: Box = match prefetch_output { - Some(path) => Box::new(BufWriter::new(File::create(path).unwrap())), - None => Box::new(std::io::stdout()), - }; - let mut writer = BufWriter::new(prefetch_out); +) -> Result<(), Box> { + // Define the writer to stdout by default + let mut writer: Box = Box::new(std::io::stdout()); + + if let Some(output_path) = &prefetch_output { + // Account for potential missing dir in output path + let directory_path = Path::new(output_path).parent(); + + // If a directory path exists in the filename, create it if it doesn't already exist + if let Some(dir) = directory_path { + create_dir_all(dir)?; + } + + let file = File::create(output_path)?; + writer = Box::new(BufWriter::new(file)); + } + writeln!( &mut writer, "query_filename,query_name,query_md5,match_name,match_md5,intersect_bp" @@ -860,18 +870,27 @@ pub fn report_on_sketch_loading( /// Execute the gather algorithm, greedy min-set-cov, by iteratively /// removing matches in 'matchlist' from 'query'. -pub fn consume_query_by_gather + std::fmt::Debug + std::fmt::Display + Clone>( +pub fn consume_query_by_gather( query: SigStore, matchlist: BinaryHeap, threshold_hashes: u64, - gather_output: Option

, + gather_output: Option, ) -> Result<()> { - // Set up a writer for gather output - let gather_out: Box = match gather_output { - Some(path) => Box::new(BufWriter::new(File::create(path).unwrap())), - None => Box::new(std::io::stdout()), - }; - let mut writer = BufWriter::new(gather_out); + // Define the writer to stdout by default + let mut writer: Box = Box::new(std::io::stdout()); + + if let Some(output_path) = &gather_output { + // Account for potential missing dir in output path + let directory_path = Path::new(output_path).parent(); + + // If a directory path exists in the filename, create it if it doesn't already exist + if let Some(dir) = directory_path { + create_dir_all(dir)?; + } + + let file = File::create(output_path)?; + writer = Box::new(BufWriter::new(file)); + } writeln!( &mut writer, "query_filename,rank,query_name,query_md5,match_name,match_md5,intersect_bp" @@ -881,12 +900,10 @@ pub fn consume_query_by_gather + std::fmt::Debug + std::fmt::Disp let mut matching_sketches = matchlist; let mut rank = 0; - let mut last_hashes = query.size(); let mut last_matches = matching_sketches.len(); // let location = query.location; - let location = query.filename(); - // let mut query_mh = query.minhash; + let location = query.filename(); // this is different (original fasta filename) than query.location was (sig name)!! let sketches = query.sketches(); let orig_query_mh = match sketches.get(0) { @@ -894,12 +911,13 @@ pub fn consume_query_by_gather + std::fmt::Debug + std::fmt::Disp _ => Err(anyhow::anyhow!("No MinHash found")), }?; let mut query_mh = orig_query_mh.clone(); + let mut last_hashes = orig_query_mh.size(); eprintln!( "{} iter {}: start: query hashes={} matches={}", location, rank, - query.size(), + orig_query_mh.size(), matching_sketches.len() ); From 363b90d382103e7b00857d957f483980ad04b635 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Tue, 30 Jan 2024 14:58:23 -0800 Subject: [PATCH 24/47] re-allow load from sig; upd manysearch --- Cargo.lock | 2 +- Cargo.toml | 4 +- src/check.rs | 5 +- src/fastgather.rs | 85 ++++++++------- src/fastmultigather.rs | 13 ++- src/lib.rs | 23 ++--- src/manysearch.rs | 148 ++++++++++++++------------- src/mastiff_manygather.rs | 117 +++++++++++---------- src/python/tests/test_gather.py | 15 +-- src/python/tests/test_multigather.py | 65 +++++++++--- src/python/tests/test_search.py | 32 +++--- src/utils.rs | 130 +++++++++++------------ 12 files changed, 348 insertions(+), 291 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a9e84652..fe8ac5a6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1396,7 +1396,7 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.12.0" -source = "git+https://github.com/sourmash-bio/sourmash?rev=94b88cc314f781342721addc5ed35c531732a9b6#94b88cc314f781342721addc5ed35c531732a9b6" +source = "git+https://github.com/sourmash-bio/sourmash?rev=409aeb415ba8b04b9c09f203817d67791afa96da#409aeb415ba8b04b9c09f203817d67791afa96da" dependencies = [ "az", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index 21b3976e..10b0afec 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,8 +12,8 @@ crate-type = ["cdylib"] pyo3 = { version = "0.20.2", features = ["extension-module", "anyhow"] } rayon = "1.8.1" serde = { version = "1.0.195", features = ["derive"] } -sourmash = { git = "https://github.com/sourmash-bio/sourmash", rev= "94b88cc314f781342721addc5ed35c531732a9b6", features = ["branchwater"] } -#sourmash = { version = "0.12.0", features = ["branchwater"] } +sourmash = { git = "https://github.com/sourmash-bio/sourmash", rev= "409aeb415ba8b04b9c09f203817d67791afa96da", features = ["branchwater"] } +#sourmash = { version = "0.12.1", features = ["branchwater"] } serde_json = "1.0.111" niffler = "2.4.0" log = "0.4.14" diff --git a/src/check.rs b/src/check.rs index 
1311318c..2995284b 100644 --- a/src/check.rs +++ b/src/check.rs @@ -4,10 +4,7 @@ use sourmash::index::revindex::{RevIndex, RevIndexOps}; pub fn check(index: camino::Utf8PathBuf, quick: bool) -> Result<(), Box> { if !is_revindex_database(&index) { - bail!( - "'{}' is not a valid RevIndex database", - index - ); + bail!("'{}' is not a valid RevIndex database", index); } println!("Opening DB"); diff --git a/src/fastgather.rs b/src/fastgather.rs index 2ff13509..5e8e3b07 100644 --- a/src/fastgather.rs +++ b/src/fastgather.rs @@ -1,20 +1,24 @@ /// fastgather: Run gather with a query against a list of files. use anyhow::Result; -use sourmash::sketch::Sketch; -use sourmash::signature::Signature; +use serde::Serialize; use sourmash::selection::Selection; -use camino; -use std::collections::BinaryHeap; +use sourmash::signature::Signature; +use sourmash::sketch::Sketch; +// use camino; use crate::utils::PrefetchResult; +use std::collections::BinaryHeap; + +use sourmash::prelude::Select; use crate::utils::{ - consume_query_by_gather, load_sketches_above_threshold, write_prefetch, ReportType, load_collection + consume_query_by_gather, load_collection, load_sketches_above_threshold, write_prefetch, + ReportType, }; pub fn fastgather( - query_filepath: camino::Utf8PathBuf, - against_filepath: camino::Utf8PathBuf, + query_filepath: &camino::Utf8PathBuf, + against_filepath: &camino::Utf8PathBuf, threshold_bp: usize, ksize: u8, scaled: usize, @@ -22,42 +26,45 @@ pub fn fastgather( gather_output: Option, prefetch_output: Option, ) -> Result<()> { + let query_collection = load_collection(query_filepath, selection, ReportType::Query)?; + let mut query_sig = None; + let mut query_mh = None; - let query_collection = load_collection(&query_filepath, selection, ReportType::Query)?; + for (idx, record) in query_collection.iter() { + if let Ok(sig) = query_collection + .sig_for_dataset(idx) + .unwrap() + .select(&selection) + { + query_sig = Some(sig.clone()); - if query_collection.len() > 1 
{ - bail!("Found more than one compatible sketch from '{}'. Fastgather requires a single query sketch.", &query_filepath) - } - // load query sig into memory - let mut query_mh = None; - let mut query_sig = None; - for (idx, _record) in query_collection.iter() { - // Load query sig - match query_collection.sig_for_dataset(idx) { - Ok(query_sig) => { - for sketch in query_sig.iter() { - // Access query MinHash - if let Sketch::MinHash(query) = sketch { - query_mh = Some(query.clone()); - break; - } + for sketch in sig.iter() { + // Access query MinHash + if let Sketch::MinHash(mh) = sketch { + query_mh = Some(mh.clone()); + // eprintln!("mh mins: {:?}", mh.mins()); } } - Err(_) => { - bail!("No query sketch matching selection parameters.") // should not get here bc we already check this during collection loading? - } + } else { + eprintln!("Failed to load 'query sig: {}", record.name()); } + } + if query_mh.is_none() { + bail!("No query sketch matching selection parameters."); + } - if query_mh.is_some() { - break; // Exit the loop if we found a MinHash sketch - } + if query_collection.len() != 1 { + bail!( + "Fastgather requires a single query sketch. Check input: '{:?}'", + &query_filepath + ) } // build the list of paths to match against. 
eprintln!("Loading matchlist from '{}'", against_filepath); - let against_collection = load_collection(&against_filepath, selection, ReportType::Against)?; + let against_collection = load_collection(against_filepath, selection, ReportType::Against)?; eprintln!("Loaded {} sig paths in matchlist", against_collection.len()); - + // calculate the minimum number of hashes based on desired threshold let threshold_hashes: u64 = { let x = threshold_bp / scaled; @@ -74,8 +81,8 @@ pub fn fastgather( threshold_hashes, threshold_bp ); - // load a set of sketches, filtering for those with overlaps > threshold - let result = load_sketches_above_threshold( + // load a set of sketches, filtering for those with overlaps > threshold + let result = load_sketches_above_threshold( against_collection, &selection, &query_mh.unwrap(), @@ -107,6 +114,12 @@ pub fn fastgather( } // run the gather! - consume_query_by_gather(query_sig.clone().unwrap(), matchlist, threshold_hashes, gather_output).ok(); + consume_query_by_gather( + query_sig.clone().unwrap(), + matchlist, + threshold_hashes, + gather_output, + ) + .ok(); Ok(()) -} \ No newline at end of file +} diff --git a/src/fastmultigather.rs b/src/fastmultigather.rs index 71a0e174..d7537c8a 100644 --- a/src/fastmultigather.rs +++ b/src/fastmultigather.rs @@ -3,6 +3,7 @@ use anyhow::Result; use rayon::prelude::*; use serde::Serialize; +use sourmash::prelude::Select; use sourmash::selection::Selection; use sourmash::sketch::Sketch; use sourmash::storage::SigStore; @@ -16,8 +17,7 @@ use std::collections::BinaryHeap; use camino::{Utf8Path, Utf8PathBuf}; use crate::utils::{ - consume_query_by_gather, load_collection, load_sigpaths_from_zip_or_pathlist, - load_sketches_from_zip_or_pathlist, prepare_query, write_prefetch, PrefetchResult, ReportType, + consume_query_by_gather, load_collection, write_prefetch, PrefetchResult, ReportType, }; pub fn fastmultigather( @@ -50,7 +50,11 @@ pub fn fastmultigather( let mut sketchlist: Vec = vec![]; for (idx, 
record) in against_collection.iter() { - if let Ok(sig) = against_collection.sig_for_dataset(idx) { + if let Ok(sig) = against_collection.sig_for_dataset(idx) + // .unwrap() + // .select(&selection) // if we select here, we downsample and the md5sum changes! + // ...which means we would lose the original md5sum that is used in the standard gather results. + { sketchlist.push(sig); } else { eprintln!("Failed to load 'against' record: {}", record.name()); @@ -74,13 +78,14 @@ pub fn fastmultigather( // Access query MinHash if let Sketch::MinHash(query) = sketch { let matchlist: BinaryHeap = sketchlist - .par_iter() + .iter() .filter_map(|sm| { let mut mm = None; // Access against MinHash if let Some(sketch) = sm.sketches().get(0) { if let Sketch::MinHash(against_sketch) = sketch { if let Ok(overlap) = + // downsample here to just get downsampled mh and avoid changing md5sum against_sketch.count_common(&query, true) { if overlap >= threshold_hashes { diff --git a/src/lib.rs b/src/lib.rs index 18f8e9de..5e55e1f3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,8 +6,8 @@ use sourmash::selection; extern crate simple_error; mod utils; -use crate::utils::{build_template, build_selection}; use crate::utils::is_revindex_database; +use crate::utils::{build_selection, build_template}; mod check; mod fastgather; mod fastmultigather; @@ -33,15 +33,14 @@ fn do_manysearch( moltype: String, output_path: Option, ) -> anyhow::Result { - let queryfile_path: camino::Utf8PathBuf = querylist_path.clone().into(); let againstfile_path: camino::Utf8PathBuf = siglist_path.clone().into(); let selection = build_selection(ksize, scaled, &moltype); + eprintln!("selection scaled: {:?}", selection.scaled()); // if siglist_path is revindex, run mastiff_manysearch; otherwise run manysearch - let template = build_template(ksize, scaled, &moltype); if is_revindex_database(&againstfile_path) { - // if is_revindex_database(siglist_path.as_ref()) { + // if is_revindex_database(siglist_path.as_ref()) { match 
mastiff_manysearch::mastiff_manysearch( queryfile_path, againstfile_path, @@ -57,9 +56,9 @@ fn do_manysearch( } } else { match manysearch::manysearch( - querylist_path, - siglist_path, - template, + &queryfile_path, + &againstfile_path, + &selection, threshold, output_path, ) { @@ -85,11 +84,12 @@ fn do_fastgather( ) -> anyhow::Result { let queryfile_path: camino::Utf8PathBuf = query_filename.into(); let againstfile_path: camino::Utf8PathBuf = siglist_path.into(); + let selection = build_selection(ksize, scaled, &moltype); - + match fastgather::fastgather( - queryfile_path, - againstfile_path, + &queryfile_path, + &againstfile_path, threshold_bp, ksize, scaled, @@ -115,11 +115,10 @@ fn do_fastmultigather( moltype: String, output_path: Option, ) -> anyhow::Result { - let queryfile_path: camino::Utf8PathBuf = query_filenames.into(); let againstfile_path: camino::Utf8PathBuf = siglist_path.into(); let selection = build_selection(ksize, scaled, &moltype); - + // if a siglist path is a revindex, run mastiff_manygather. 
If not, run multigather if is_revindex_database(&againstfile_path) { match mastiff_manygather::mastiff_manygather( diff --git a/src/manysearch.rs b/src/manysearch.rs index a95f8d69..53f25e3c 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -6,44 +6,47 @@ use anyhow::Result; use rayon::prelude::*; -use sourmash::signature::{Signature, SigsTrait}; +use sourmash::prelude::Select; +use sourmash::selection::Selection; +use sourmash::signature::SigsTrait; use sourmash::sketch::Sketch; +use sourmash::storage::SigStore; use std::path::Path; use std::sync::atomic; use std::sync::atomic::AtomicUsize; -use crate::utils::{ - csvwriter_thread, load_sigpaths_from_zip_or_pathlist, load_sketches_from_zip_or_pathlist, - prepare_query, ReportType, SearchResult, -}; +use crate::utils::{csvwriter_thread, load_collection, ReportType, SearchResult}; pub fn manysearch>( - querylist: P, - siglist: P, - template: Sketch, + query_filepath: &camino::Utf8PathBuf, + against_filepath: &camino::Utf8PathBuf, + selection: &Selection, threshold: f64, output: Option

, ) -> Result<()> { // Read in list of query paths. - eprintln!( - "Reading list of queries from: '{}'", - querylist.as_ref().display() - ); - - // Load all queries into memory at once. - let queries = load_sketches_from_zip_or_pathlist(querylist, &template, ReportType::Query)?; - - // Load all _paths_, not signatures, into memory. - let siglist_name = siglist.as_ref().to_string_lossy().to_string(); - let (search_sigs_paths, _temp_dir) = - load_sigpaths_from_zip_or_pathlist(siglist, &template, ReportType::Against)?; - - if search_sigs_paths.is_empty() { - bail!("No signatures to search loaded, exiting."); + eprintln!("Reading queries from: '{}'", query_filepath); + + // Load all query sigs into memory at once. + let query_collection = load_collection(query_filepath, selection, ReportType::Query)?; + // load actual signatures + let mut query_sketchlist: Vec = vec![]; + + for (idx, record) in query_collection.iter() { + if let Ok(sig) = query_collection + .sig_for_dataset(idx) + .unwrap() + .select(&selection) + { + query_sketchlist.push(sig); + } else { + eprintln!("Failed to load 'query' sig: {}", record.name()); + } } - eprintln!("Loaded {} sig paths to search.", search_sigs_paths.len()); + // Load all _paths_, not signatures, into memory. + let against_collection = load_collection(against_filepath, selection, ReportType::Against)?; // set up a multi-producer, single-consumer channel. 
let (send, recv) = std::sync::mpsc::sync_channel::(rayon::current_num_threads()); @@ -61,9 +64,9 @@ pub fn manysearch>( let skipped_paths = AtomicUsize::new(0); let failed_paths = AtomicUsize::new(0); - let send = search_sigs_paths + let send = against_collection .par_iter() - .filter_map(|filename| { + .filter_map(|(idx, record)| { let i = processed_sigs.fetch_add(1, atomic::Ordering::SeqCst); if i % 1000 == 0 { eprintln!("Processed {} search sigs", i); @@ -71,60 +74,65 @@ pub fn manysearch>( let mut results = vec![]; - // load search signature from path: - match Signature::from_path(filename) { - Ok(search_sigs) => { - let location = filename.display().to_string(); - if let Some(search_sm) = prepare_query(&search_sigs, &template, &location) { - // search for matches & save containment. - for q in queries.iter() { - let overlap = - q.minhash.count_common(&search_sm.minhash, false).unwrap() as f64; - let query_size = q.minhash.size() as f64; - let target_size = search_sm.minhash.size() as f64; - - let containment_query_in_target = overlap / query_size; - let containment_in_target = overlap / target_size; - let max_containment = - containment_query_in_target.max(containment_in_target); - let jaccard = overlap / (target_size + query_size - overlap); - - if containment_query_in_target > threshold { - results.push(SearchResult { - query_name: q.name.clone(), - query_md5: q.md5sum.clone(), - match_name: search_sm.name.clone(), - containment: containment_query_in_target, - intersect_hashes: overlap as usize, - match_md5: Some(search_sm.md5sum.clone()), - jaccard: Some(jaccard), - max_containment: Some(max_containment), - }); + match against_collection.sig_for_dataset(idx) { + Ok(against_sig) => match against_sig.select(selection) { + Ok(against_sig) => { + for sketch in against_sig.iter() { + if let Sketch::MinHash(against_mh) = sketch { + for query_sig in query_sketchlist.iter() { + for sketch in query_sig.iter() { + if let Sketch::MinHash(query_mh) = sketch { + let 
overlap = + query_mh.count_common(&against_mh, false).unwrap() + as f64; + let query_size = query_mh.size() as f64; + let target_size = against_mh.size() as f64; + + let containment_query_in_target = overlap / query_size; + let containment_in_target = overlap / target_size; + let max_containment = containment_query_in_target + .max(containment_in_target); + let jaccard = + overlap / (target_size + query_size - overlap); + + if containment_query_in_target > threshold { + results.push(SearchResult { + query_name: query_sig.name(), + query_md5: query_mh.md5sum(), + match_name: against_sig.name(), + containment: containment_query_in_target, + intersect_hashes: overlap as usize, + match_md5: Some(against_mh.md5sum()), + jaccard: Some(jaccard), + max_containment: Some(max_containment), + }); + } + } + } + } } } - } else { - // for reading zips, this is likely not a useful warning and - // would show up too often (every sig is stored as individual file). - if !siglist_name.ends_with(".zip") { - eprintln!( - "WARNING: no compatible sketches in path '{}'", - filename.display() - ); - } + } + Err(err) => { + eprintln!("Sketch selection error: {}", err); + eprintln!( + "WARNING: no compatible sketches in path '{}'", + record.internal_location() + ); let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } - Some(results) - } + }, Err(err) => { - let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); eprintln!("Sketch loading error: {}", err); eprintln!( - "WARNING: could not load sketches from path '{}'", - filename.display() + "WARNING: no compatible sketches in path '{}'", + record.internal_location() ); - None + let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } } + + Some(results) }) .flatten() .try_for_each_with(send, |s, m| s.send(m)); diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index 19da5728..2175e759 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -21,7 +21,6 @@ use std::io::{BufWriter, 
Write}; use crate::utils::{is_revindex_database, load_collection, ReportType}; - pub fn mastiff_manygather>( queries_file: camino::Utf8PathBuf, index: camino::Utf8PathBuf, @@ -30,10 +29,7 @@ pub fn mastiff_manygather>( output: Option

, ) -> Result<(), Box> { if !is_revindex_database(&index) { - bail!( - "'{}' is not a valid RevIndex database", - index - ); + bail!("'{}' is not a valid RevIndex database", index); } // Open database once let db = RevIndex::open(index, true)?; @@ -77,67 +73,70 @@ pub fn mastiff_manygather>( let failed_paths = AtomicUsize::new(0); let send = query_collection - .par_iter() - .filter_map(|(idx, record)| { - let threshold = threshold_bp / selection.scaled()? as usize; - - match query_collection.sig_for_dataset(idx) { - // match query_collection.sig_from_record(record) { // to be added in core - Ok(query_sig) => { - let mut results = vec![]; - let mut found_compatible_sketch = false; - for sketch in query_sig.iter() { - if let Sketch::MinHash(query) = sketch { - found_compatible_sketch = true; - // Gather! - let (counter, query_colors, hash_to_color) = - db.prepare_gather_counters(&query); - - let matches = db.gather( - counter, - query_colors, - hash_to_color, - threshold, - &query, - Some(selection.clone()), - ); - // extract results - if let Ok(matches) = matches { - for match_ in &matches { - results.push(( - query_sig.name().clone(), - query.md5sum().clone(), - match_.name().clone(), - match_.md5().clone(), - match_.f_match(), // f_match_query - match_.intersect_bp(), - )); // intersect_bp + .par_iter() + .filter_map(|(idx, record)| { + let threshold = threshold_bp / selection.scaled()? as usize; + + match query_collection.sig_for_dataset(idx) { + // match query_collection.sig_from_record(record) { // to be added in core + Ok(query_sig) => { + let mut results = vec![]; + let mut found_compatible_sketch = false; + for sketch in query_sig.iter() { + if let Sketch::MinHash(query) = sketch { + found_compatible_sketch = true; + // Gather! 
+ let (counter, query_colors, hash_to_color) = + db.prepare_gather_counters(&query); + + let matches = db.gather( + counter, + query_colors, + hash_to_color, + threshold, + &query, + Some(selection.clone()), + ); + // extract results + if let Ok(matches) = matches { + for match_ in &matches { + results.push(( + query_sig.name().clone(), + query.md5sum().clone(), + match_.name().clone(), + match_.md5().clone(), + match_.f_match(), // f_match_query + match_.intersect_bp(), + )); // intersect_bp + } + } else { + eprintln!("Error gathering matches: {:?}", matches.err()); } - } else { - eprintln!("Error gathering matches: {:?}", matches.err()); } } - } - if !found_compatible_sketch { - eprintln!("WARNING: no compatible sketches in path '{}'", query_sig.filename()); - let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); - } + if !found_compatible_sketch { + eprintln!( + "WARNING: no compatible sketches in path '{}'", + query_sig.filename() + ); + let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); + } - if results.is_empty() { + if results.is_empty() { + None + } else { + Some(results) + } + } + Err(err) => { + eprintln!("Error loading sketch: {}", err); + let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); None - } else { - Some(results) } } - Err(err) => { - eprintln!("Error loading sketch: {}", err); - let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); - None - } - } - }) - .flatten() - .try_for_each_with(send, |s, m| s.send(m)); + }) + .flatten() + .try_for_each_with(send, |s, m| s.send(m)); // do some cleanup and error handling - if let Err(e) = send { diff --git a/src/python/tests/test_gather.py b/src/python/tests/test_gather.py index 2b59ea2b..d4649a63 100644 --- a/src/python/tests/test_gather.py +++ b/src/python/tests/test_gather.py @@ -120,7 +120,8 @@ def test_missing_query(runtmp, capfd, zip_against): captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory ' in captured.err + assert 
'Error: No such file or directory' in captured.err + @pytest.mark.parametrize('zip_against', [False, True]) def test_bad_query(runtmp, capfd, zip_against): @@ -132,9 +133,9 @@ def test_bad_query(runtmp, capfd, zip_against): sig47 = get_test_data('47.fa.sig.gz') sig63 = get_test_data('63.fa.sig.gz') - # since 'query' needs to be a sig, this breaks it. - make_file_list(query, [sig2]) - + # query doesn't need to be a sig anymore - sig, zip, or pathlist welcome + # as long as there's only one sketch that matches params + make_file_list(query, [sig2,sig47]) # [sig2] make_file_list(against_list, [sig2, sig47, sig63]) if zip_against: @@ -151,7 +152,7 @@ def test_bad_query(runtmp, capfd, zip_against): captured = capfd.readouterr() print(captured.err) - assert 'Error: expected value at line 1' in captured.err + assert 'Error: Fastgather requires a single query sketch. Check input:' in captured.err @pytest.mark.parametrize('zip_against', [False, True]) @@ -179,7 +180,7 @@ def test_missing_against(runtmp, capfd, zip_against): captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory ' in captured.err + assert 'Error: No such file or directory' in captured.err def test_bad_against(runtmp, capfd): @@ -199,7 +200,7 @@ def test_bad_against(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid line in fromfile ' in captured.err + assert 'Error: invalid line in fromfile' in captured.err def test_bad_against_2(runtmp, capfd): diff --git a/src/python/tests/test_multigather.py b/src/python/tests/test_multigather.py index 960bc68d..7ec636ba 100644 --- a/src/python/tests/test_multigather.py +++ b/src/python/tests/test_multigather.py @@ -188,8 +188,10 @@ def test_missing_querylist(runtmp, capfd, indexed, zip_query): @pytest.mark.parametrize('indexed', [False, True]) -def test_bad_query(runtmp, capfd, indexed): - # test bad querylist (a sig file) +def test_sig_query(runtmp, capfd, indexed): + # sig file is now fine as 
a query + query = get_test_data('SRR606249.sig.gz') + against_list = runtmp.output('against.txt') sig2 = get_test_data('2.fa.sig.gz') @@ -200,19 +202,37 @@ def test_bad_query(runtmp, capfd, indexed): if indexed: against_list = index_siglist(runtmp, against_list, runtmp.output('db')) + g_output = runtmp.output('out.csv') + else: + g_output = runtmp.output('SRR606249.gather.csv') + p_output = runtmp.output('SRR606249.prefetch.csv') - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'fastmultigather', sig2, against_list, - '-s', '100000') + runtmp.sourmash('scripts', 'fastmultigather', query, against_list, + '-s', '100000', '-o', g_output) captured = capfd.readouterr() print(captured.err) + if not indexed: + # check prefetch output (only non-indexed gather) + assert os.path.exists(p_output) + df = pandas.read_csv(p_output) + assert len(df) == 3 + keys = set(df.keys()) + assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp'} - assert 'Error: invalid line in fromfile' in captured.err + # check gather output (both) + assert os.path.exists(g_output) + df = pandas.read_csv(g_output) + assert len(df) == 3 + keys = set(df.keys()) + if indexed: + assert keys == {'query_name', 'query_md5', 'match_name', 'match_md5', 'f_match_query', 'intersect_bp'} + else: + assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'rank', 'intersect_bp'} @pytest.mark.parametrize('indexed', [False, True]) -def test_bad_query_2(runtmp, capfd, indexed): +def test_bad_query(runtmp, capfd, indexed): # test with a bad query (a .sig.gz file renamed as zip file) against_list = runtmp.output('against.txt') @@ -324,24 +344,37 @@ def test_missing_against(runtmp, capfd, zip_against): assert 'Error: No such file or directory' in captured.err -def test_bad_against(runtmp, capfd): - # test bad 'against' file - in this case, use a .sig.gz file. 
+def test_sig_against(runtmp, capfd): + # against file can be a sig now query = get_test_data('SRR606249.sig.gz') against_list = runtmp.output('against.txt') sig2 = get_test_data('2.fa.sig.gz') - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'fastmultigather', query, sig2, + g_output = runtmp.output('SRR606249.gather.csv') + p_output = runtmp.output('SRR606249.prefetch.csv') + runtmp.sourmash('scripts', 'fastmultigather', query, sig2, '-s', '100000') captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid line in fromfile' in captured.err + # check prefetch output (only non-indexed gather) + assert os.path.exists(p_output) + df = pandas.read_csv(p_output) + assert len(df) == 1 + keys = set(df.keys()) + assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp'} + + # check gather output + assert os.path.exists(g_output) + df = pandas.read_csv(g_output) + assert len(df) == 1 + keys = set(df.keys()) + assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'rank', 'intersect_bp'} -def test_bad_against_2(runtmp, capfd): +def test_bad_against(runtmp, capfd): # test bad 'against' file - in this case, one containing a nonexistent file query = get_test_data('SRR606249.sig.gz') query_list = runtmp.output('query.txt') @@ -363,8 +396,8 @@ def test_bad_against_2(runtmp, capfd): @pytest.mark.parametrize('zip_query', [False, True]) -def test_bad_against_3(runtmp, capfd, zip_query): - # test with a bad query (a .sig.gz file renamed as zip file) +def test_bad_against_2(runtmp, capfd, zip_query): + # test with a bad against (a .sig.gz file renamed as zip file) query = get_test_data('SRR606249.sig.gz') query_list = runtmp.output('query.txt') make_file_list(query_list, [query]) @@ -382,7 +415,7 @@ def test_bad_against_3(runtmp, capfd, zip_query): with pytest.raises(utils.SourmashCommandFailed): runtmp.sourmash('scripts', 'fastmultigather', query_list, 
against_zip, - '-o', output) + '-s', '100000', '-o', output) captured = capfd.readouterr() print(captured.err) diff --git a/src/python/tests/test_search.py b/src/python/tests/test_search.py index 6f29ec9b..8af6bf3f 100644 --- a/src/python/tests/test_search.py +++ b/src/python/tests/test_search.py @@ -251,8 +251,8 @@ def test_missing_query(runtmp, capfd, indexed, zip_query): @pytest.mark.parametrize("indexed", [False, True]) -def test_bad_query(runtmp, capfd, indexed): - # test with a bad query (a .sig.gz file) +def test_sig_query(runtmp, capfd, indexed): + # test with a single sig query (a .sig.gz file) against_list = runtmp.output('against.txt') sig2 = get_test_data('2.fa.sig.gz') @@ -266,14 +266,14 @@ def test_bad_query(runtmp, capfd, indexed): output = runtmp.output('out.csv') - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'manysearch', sig2, against_list, + # with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'manysearch', sig2, against_list, '-o', output) - captured = capfd.readouterr() - print(captured.err) + # captured = capfd.readouterr() + # print(captured.err) - assert 'Error: invalid line in fromfile' in captured.err + # assert 'Error: invalid line in fromfile' in captured.err @pytest.mark.parametrize("indexed", [False, True]) @@ -352,34 +352,34 @@ def test_missing_against(runtmp, capfd, indexed): captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory ' in captured.err + assert 'Error: No such file or directory' in captured.err -def test_bad_against(runtmp, capfd): - # test with a bad against list (a .sig file in this case) +def test_nomatch_against(runtmp, capfd): + # nonmatching against file (num sig) query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') sig2 = get_test_data('2.fa.sig.gz') sig47 = get_test_data('47.fa.sig.gz') sig63 = get_test_data('63.fa.sig.gz') + # nomatch_sketch = get_test_data('genome-s11.fa.gz.sig') + 
nomatch_sketch = get_test_data('SRR606249.sig.gz') make_file_list(query_list, [sig2, sig47, sig63]) - #make_file_list(against_list, [sig2, sig47, sig63]) + make_file_list(against_list, [nomatch_sketch]) output = runtmp.output('out.csv') with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'manysearch', query_list, sig2, + runtmp.sourmash('scripts', 'manysearch', query_list, against_list, '-o', output) captured = capfd.readouterr() - print(captured.err) + assert "No search signatures loaded, exiting." in captured.err - assert 'Error: invalid line in fromfile ' in captured.err - -def test_bad_against_2(runtmp, capfd): +def test_bad_against(runtmp, capfd): # test with a bad against list (a missing file) query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') diff --git a/src/utils.rs b/src/utils.rs index a6b07b02..be99bab6 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -693,62 +693,81 @@ pub fn load_collection( } let mut n_failed = 0; - let collection = if sigpath.extension().map_or(false, |ext| ext == "zip") { + let mut collection = if sigpath.extension().map_or(false, |ext| ext == "zip") { match Collection::from_zipfile(&sigpath) { Ok(collection) => collection, - Err(_) => { - bail!("failed to load {} zipfile: '{}'", report_type, sigpath); - } + Err(_) => bail!("failed to load {} zipfile: '{}'", report_type, sigpath), } } else { - let sketchlist_file = BufReader::new(File::open(sigpath)?); - - let records: Vec = sketchlist_file - .lines() - .filter_map(|line| { - let path = match line { - Ok(path) => path, - Err(err) => { - eprintln!("Error: invalid line in fromfile"); - return None; // Skip - } - }; - - match Signature::from_path(&path) { - Ok(signatures) => { - let recs: Vec = signatures - .into_iter() - .flat_map(|v| Record::from_sig(&v, path.as_str())) - .collect(); - Some(recs) - } - Err(err) => { - eprintln!("Sketch loading error: {}", err); - eprintln!("WARNING: could not load sketches from path '{}'", 
path); - n_failed += 1; - None - } + // if pathlist is just a signature path, load it into a collection + match Signature::from_path(sigpath) { + Ok(signatures) => { + // Load the collection from the signature + match Collection::from_sigs(signatures) { + Ok(collection) => collection, + Err(_) => bail!( + "loaded {} signatures but failed to load as collection: '{}'", + report_type, + sigpath + ), } - }) - .flatten() - .collect(); - - let manifest: Manifest = records.into(); - - Collection::new( - manifest, - InnerStorage::new( - FSStorage::builder() - .fullpath("".into()) - .subdir("".into()) - .build(), - ), - ) + } + // if not, try to load file as list of sig paths + Err(_) => { + // // using core fn doesn't allow us to ignore failed paths; I reimplement loading here to allow + let sketchlist_file = BufReader::new(File::open(sigpath)?); + let records: Vec = sketchlist_file + .lines() + .filter_map(|line| { + let path = line.ok()?; + match Signature::from_path(&path) { + Ok(signatures) => { + let recs: Vec = signatures + .into_iter() + .flat_map(|v| Record::from_sig(&v, &path)) + .collect(); + Some(recs) + } + Err(err) => { + eprintln!("Sketch loading error: {}", err); + eprintln!("WARNING: could not load sketches from path '{}'", path); + n_failed += 1; + None + } + } + }) + .flatten() + .collect(); + + let manifest: Manifest = records.into(); + Collection::new( + manifest, + InnerStorage::new( + FSStorage::builder() + .fullpath("".into()) + .subdir("".into()) + .build(), + ), + ) + } + } }; let n_total = collection.len(); - let selected = collection.select(&selection)?; + eprintln!("n_total: {}", n_total); + // collection = collection.select(selection)?; + let selected = collection.select(selection)?; + + if selected.len() == 1 { + let sig = selected.sig_for_dataset(0).unwrap(); + eprintln!("sig name: {:?}", sig.name()); + let mh = sig.minhash().unwrap(); + eprintln!("scaled= {:?}", mh.scaled()) + } + + eprintln!("selection_len: {}", selected.len()); let 
n_skipped = n_total - selected.len(); + // let n_skipped = n_total - collection.len(); report_on_collection_loading(&selected, n_skipped, n_failed, report_type)?; Ok(selected) } @@ -780,23 +799,6 @@ pub fn report_on_collection_loading( Ok(()) } -pub fn load_single_sig_from_collection( - query_collection: &Collection, // Replace with the actual type - selection: &Selection, -) -> Result { - let scaled = selection.scaled().unwrap(); - let ksize = selection.ksize().unwrap(); - - match query_collection.sig_for_dataset(0) { - Ok(sig) => Ok(sig), - Err(_) => Err(anyhow::anyhow!( - "No sketch found with scaled={}, k={}", - scaled, - ksize - )), - } -} - // pub fn load_single_sketch_from_sig<'a>(sig: &'a SigStore, selection: &'a Selection) -> Result<&'a KmerMinHash> { // let sketch = sig.sketches().get(0).ok_or_else(|| { // anyhow::anyhow!("No sketch found with scaled={}, k={}", selection.scaled().unwrap_or_default(), selection.ksize().unwrap_or_default()) From 0ea39b5b856ebda777fdb45e1b6ac6b322c0ffc7 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Tue, 30 Jan 2024 16:21:38 -0800 Subject: [PATCH 25/47] fix all except moltype selection --- src/fastgather.rs | 54 +++++++++++++-------------------- src/python/tests/test_gather.py | 23 ++++++++------ src/utils.rs | 23 ++++++++------ 3 files changed, 49 insertions(+), 51 deletions(-) diff --git a/src/fastgather.rs b/src/fastgather.rs index 5e8e3b07..8680abaa 100644 --- a/src/fastgather.rs +++ b/src/fastgather.rs @@ -10,6 +10,7 @@ use crate::utils::PrefetchResult; use std::collections::BinaryHeap; use sourmash::prelude::Select; +use sourmash::signature::SigsTrait; use crate::utils::{ consume_query_by_gather, load_collection, load_sketches_above_threshold, write_prefetch, @@ -27,31 +28,6 @@ pub fn fastgather( prefetch_output: Option, ) -> Result<()> { let query_collection = load_collection(query_filepath, selection, ReportType::Query)?; - let mut query_sig = None; - let mut query_mh = None; - - for (idx, record) in query_collection.iter() { - if let Ok(sig) = query_collection - .sig_for_dataset(idx) - .unwrap() - .select(&selection) - { - query_sig = Some(sig.clone()); - - for sketch in sig.iter() { - // Access query MinHash - if let Sketch::MinHash(mh) = sketch { - query_mh = Some(mh.clone()); - // eprintln!("mh mins: {:?}", mh.mins()); - } - } - } else { - eprintln!("Failed to load 'query sig: {}", record.name()); - } - } - if query_mh.is_none() { - bail!("No query sketch matching selection parameters."); - } if query_collection.len() != 1 { bail!( @@ -59,6 +35,22 @@ pub fn fastgather( &query_filepath ) } + // get single query sig and minhash + let query_sig = query_collection.sig_for_dataset(0)?; // need original md5sum, etc + // downsample + let query_sig_ds = query_sig.clone().select(selection)?; + let query_mh = match query_sig_ds.minhash() { + Some(query_mh) => query_mh, + None => { + bail!("No query sketch matching selection parameters."); + } + }; + // some debugging prints + // eprintln!("selection scaled: {:?}", 
selection.scaled()); + // eprintln!("selection ksize: {:?}", selection.ksize()); + // eprintln!("query ksize: {:?}", query_mh.ksize()); + // eprintln!("selection moltype: {:?}", selection.moltype()); + // eprintln!("query moltype: {:?}", query_sig.hash_function()); // build the list of paths to match against. eprintln!("Loading matchlist from '{}'", against_filepath); @@ -82,12 +74,8 @@ pub fn fastgather( ); // load a set of sketches, filtering for those with overlaps > threshold - let result = load_sketches_above_threshold( - against_collection, - &selection, - &query_mh.unwrap(), - threshold_hashes, - )?; + let result = + load_sketches_above_threshold(against_collection, &selection, &query_mh, threshold_hashes)?; let matchlist = result.0; let skipped_paths = result.1; let failed_paths = result.2; @@ -110,12 +98,12 @@ pub fn fastgather( } if prefetch_output.is_some() { - write_prefetch(query_sig.as_ref().unwrap(), prefetch_output, &matchlist).ok(); + write_prefetch(&query_sig, prefetch_output, &matchlist).ok(); } // run the gather! consume_query_by_gather( - query_sig.clone().unwrap(), + query_sig.clone(), matchlist, threshold_hashes, gather_output, diff --git a/src/python/tests/test_gather.py b/src/python/tests/test_gather.py index d4649a63..e56602b3 100644 --- a/src/python/tests/test_gather.py +++ b/src/python/tests/test_gather.py @@ -183,8 +183,8 @@ def test_missing_against(runtmp, capfd, zip_against): assert 'Error: No such file or directory' in captured.err -def test_bad_against(runtmp, capfd): - # test bad 'against' file - in this case, use a .sig.gz file. 
+def test_sig_against(runtmp, capfd): + # sig file is ok as against file now query = get_test_data('SRR606249.sig.gz') sig2 = get_test_data('2.fa.sig.gz') @@ -192,18 +192,23 @@ def test_bad_against(runtmp, capfd): g_output = runtmp.output('gather.csv') p_output = runtmp.output('prefetch.csv') - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'fastgather', query, sig2, + runtmp.sourmash('scripts', 'fastgather', query, sig2, '-o', g_output, '--output-prefetch', p_output, '-s', '100000') captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid line in fromfile' in captured.err + assert os.path.exists(g_output) + + df = pandas.read_csv(g_output) + assert len(df) == 1 + keys = set(df.keys()) + assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'rank', 'intersect_bp'} + -def test_bad_against_2(runtmp, capfd): +def test_bad_against(runtmp, capfd): # test bad 'against' file - in this case, one containing a bad filename. query = get_test_data('SRR606249.sig.gz') against_list = runtmp.output('against.txt') @@ -226,7 +231,7 @@ def test_bad_against_2(runtmp, capfd): assert "WARNING: 1 search paths failed to load. See error messages above." in captured.err -def test_bad_against_3(runtmp, capfd): +def test_bad_against_2(runtmp, capfd): # test bad 'against' file - in this case, one containing an empty file query = get_test_data('SRR606249.sig.gz') against_list = runtmp.output('against.txt') @@ -254,7 +259,7 @@ def test_bad_against_3(runtmp, capfd): assert "WARNING: 1 search paths failed to load. See error messages above." 
in captured.err -def test_bad_against_4(runtmp, capfd): +def test_bad_against_3(runtmp, capfd): # test with a bad against (a .sig.gz file renamed as zip file) query = get_test_data('SRR606249.sig.gz') @@ -276,7 +281,7 @@ def test_bad_against_4(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err + assert 'InvalidArchive' in captured.err @pytest.mark.parametrize('zip_against', [False, True]) diff --git a/src/utils.rs b/src/utils.rs index be99bab6..bd1161b3 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -473,7 +473,8 @@ pub fn load_sketches_above_threshold( if let Ok(against_sig) = against_collection.sig_for_dataset(idx) { if let Some(sketch) = against_sig.sketches().get(0) { if let Sketch::MinHash(against_mh) = sketch { - if let Ok(overlap) = against_mh.count_common(query, false) { + // currently downsampling here to avoid changing md5sum + if let Ok(overlap) = against_mh.count_common(query, true) { if overlap >= threshold_hashes { let result = PrefetchResult { name: against_sig.name().to_string(), @@ -987,16 +988,20 @@ pub fn build_template(ksize: u8, scaled: usize, moltype: &str) -> Sketch { } pub fn build_selection(ksize: u8, scaled: usize, moltype: &str) -> Selection { - let hash_function = match moltype { - "dna" => HashFunctions::Murmur64Dna, - "protein" => HashFunctions::Murmur64Protein, - "dayhoff" => HashFunctions::Murmur64Dayhoff, - "hp" => HashFunctions::Murmur64Hp, - _ => panic!("Unknown molecule type: {}", moltype), - }; + // let hash_function = match moltype { + // "dna" => HashFunctions::Murmur64Dna, + // "protein" => HashFunctions::Murmur64Protein, + // "dayhoff" => HashFunctions::Murmur64Dayhoff, + // "hp" => HashFunctions::Murmur64Hp, + // _ => panic!("Unknown molecule type: {}", moltype), + // }; + let hash_function = HashFunctions::try_from(moltype) + .map_err(|_| panic!("Unknown molecule type: {}", moltype)) + .unwrap(); + let adjusted_ksize 
= if moltype == "dna" { ksize } else { ksize * 3 }; Selection::builder() - .ksize(ksize.into()) + .ksize(adjusted_ksize.into()) .scaled(scaled as u32) .moltype(hash_function) .build() From 912f717724d37ae17fc35912f0dbd76d20dd444a Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 30 Jan 2024 17:45:54 -0800 Subject: [PATCH 26/47] update fastgather and multisearch --- src/fastgather.rs | 14 -- src/lib.rs | 12 +- src/multisearch.rs | 114 +++++++++----- src/python/tests/test_gather.py | 20 ++- src/python/tests/test_multisearch.py | 36 ++--- src/python/tests/test_search.py | 2 +- src/utils.rs | 223 +++------------------------ 7 files changed, 137 insertions(+), 284 deletions(-) diff --git a/src/fastgather.rs b/src/fastgather.rs index 8680abaa..82362c85 100644 --- a/src/fastgather.rs +++ b/src/fastgather.rs @@ -1,13 +1,8 @@ /// fastgather: Run gather with a query against a list of files. use anyhow::Result; -use serde::Serialize; use sourmash::selection::Selection; -use sourmash::signature::Signature; -use sourmash::sketch::Sketch; // use camino; -use crate::utils::PrefetchResult; -use std::collections::BinaryHeap; use sourmash::prelude::Select; use sourmash::signature::SigsTrait; @@ -45,17 +40,8 @@ pub fn fastgather( bail!("No query sketch matching selection parameters."); } }; - // some debugging prints - // eprintln!("selection scaled: {:?}", selection.scaled()); - // eprintln!("selection ksize: {:?}", selection.ksize()); - // eprintln!("query ksize: {:?}", query_mh.ksize()); - // eprintln!("selection moltype: {:?}", selection.moltype()); - // eprintln!("query moltype: {:?}", query_sig.hash_function()); - // build the list of paths to match against. 
- eprintln!("Loading matchlist from '{}'", against_filepath); let against_collection = load_collection(against_filepath, selection, ReportType::Against)?; - eprintln!("Loaded {} sig paths in matchlist", against_collection.len()); // calculate the minimum number of hashes based on desired threshold let threshold_hashes: u64 = { diff --git a/src/lib.rs b/src/lib.rs index 5e55e1f3..acdc7b61 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,5 @@ /// Python interface Rust code for sourmash_plugin_branchwater. use pyo3::prelude::*; -use sourmash::selection; #[macro_use] extern crate simple_error; @@ -18,8 +17,6 @@ mod mastiff_manygather; mod mastiff_manysearch; mod multisearch; mod pairwise; -use sourmash::encodings::HashFunctions; -use sourmash::selection::Selection; use camino::Utf8PathBuf; @@ -212,13 +209,16 @@ fn do_multisearch( moltype: String, output_path: Option, ) -> anyhow::Result { + let queryfile_path: camino::Utf8PathBuf = querylist_path.into(); + let againstfile_path: camino::Utf8PathBuf = siglist_path.into(); + let selection = build_selection(ksize, scaled, &moltype); // let selection = build_selection(ksize, scaled, &moltype); let template = build_template(ksize, scaled, &moltype); match multisearch::multisearch( - querylist_path, - siglist_path, + &queryfile_path, + &againstfile_path, threshold, - template, + &selection, output_path, ) { Ok(_) => Ok(0), diff --git a/src/multisearch.rs b/src/multisearch.rs index 73fe9437..be9989f6 100644 --- a/src/multisearch.rs +++ b/src/multisearch.rs @@ -9,28 +9,59 @@ use std::path::Path; use std::sync::atomic; use std::sync::atomic::AtomicUsize; +use sourmash::prelude::Select; +use sourmash::selection::Selection; use sourmash::signature::SigsTrait; use sourmash::sketch::Sketch; +use sourmash::storage::SigStore; -use crate::utils::{load_sketches_from_zip_or_pathlist, ReportType}; +use crate::utils::{load_collection, ReportType}; /// Search many queries against a list of signatures. 
/// /// Note: this function loads all _queries_ into memory, and iterates over /// database once. -pub fn multisearch>( - querylist: P, - againstlist: P, +pub fn multisearch( + query_filepath: &camino::Utf8PathBuf, + against_filepath: &camino::Utf8PathBuf, threshold: f64, - template: Sketch, - output: Option

, + selection: &Selection, + output: Option, ) -> Result<(), Box> { // Load all queries into memory at once. - let queries = load_sketches_from_zip_or_pathlist(&querylist, &template, ReportType::Query)?; + + // let queries = load_sketches_from_zip_or_pathlist(&querylist, &template, ReportType::Query)?; + let query_collection = load_collection(query_filepath, selection, ReportType::Query)?; + let mut queries: Vec = vec![]; + for (idx, record) in query_collection.iter() { + if let Ok(sig) = query_collection.sig_from_record(record) + // .unwrap() + // .select(&selection) // if we select here, we downsample and the md5sum changes! + // ...which means we would lose the original md5sum that is used in the standard gather results. + { + queries.push(sig); + } else { + eprintln!("Failed to load 'against' record: {}", record.name()); + } + } // Load all against sketches into memory at once. - let against = load_sketches_from_zip_or_pathlist(&againstlist, &template, ReportType::Against)?; + // let against = load_sketches_from_zip_or_pathlist(&againstlist, &template, ReportType::Against)?; + let against_collection = load_collection(against_filepath, selection, ReportType::Against)?; + let mut against: Vec = vec![]; + + for (idx, record) in against_collection.iter() { + if let Ok(sig) = against_collection.sig_from_record(record) + // .unwrap() + // .select(&selection) // if we select here, we downsample and the md5sum changes! + // ...which means we would lose the original md5sum that is used in the standard gather results. + { + against.push(sig); + } else { + eprintln!("Failed to load 'against' record: {}", record.name()); + } + } // set up a multi-producer, single-consumer channel. let (send, recv) = std::sync::mpsc::sync_channel(rayon::current_num_threads()); @@ -66,39 +97,46 @@ pub fn multisearch>( .filter_map(|target| { let mut results = vec![]; - // search for matches & save containment. 
- for q in queries.iter() { - let i = processed_cmp.fetch_add(1, atomic::Ordering::SeqCst); - if i % 100000 == 0 { - eprintln!("Processed {} comparisons", i); + let ds_against_sig = target.clone().select(&selection).unwrap(); + if let Some(against_mh) = ds_against_sig.minhash() { + // search for matches & save containment. + for query_sig in queries.iter() { + let i = processed_cmp.fetch_add(1, atomic::Ordering::SeqCst); + if i % 100000 == 0 { + eprintln!("Processed {} comparisons", i); + } + let ds_q = query_sig.clone().select(&selection).unwrap(); + let query_mh = ds_q.minhash()?; + let overlap = query_mh.count_common(&against_mh, false).unwrap() as f64; + // use downsampled sizes + let query_size = query_mh.size() as f64; + let target_size = against_mh.size() as f64; + + let containment_query_in_target = overlap / query_size; + let containment_in_target = overlap / target_size; + let max_containment = containment_query_in_target.max(containment_in_target); + let jaccard = overlap / (target_size + query_size - overlap); + + if containment_query_in_target > threshold { + results.push(( + query_sig.name(), + query_sig.md5sum(), + target.name(), + target.md5sum(), + containment_query_in_target, + max_containment, + jaccard, + overlap, + )) + } } - - let overlap = q.minhash.count_common(&target.minhash, false).unwrap() as f64; - let query_size = q.minhash.size() as f64; - let target_size = target.minhash.size() as f64; - - let containment_query_in_target = overlap / query_size; - let containment_in_target = overlap / target_size; - let max_containment = containment_query_in_target.max(containment_in_target); - let jaccard = overlap / (target_size + query_size - overlap); - - if containment_query_in_target > threshold { - results.push(( - q.name.clone(), - q.md5sum.clone(), - target.name.clone(), - target.md5sum.clone(), - containment_query_in_target, - max_containment, - jaccard, - overlap, - )) + if results.is_empty() { + None + } else { + Some(results) } - } - if 
results.is_empty() { - None } else { - Some(results) + None } }) .flatten() diff --git a/src/python/tests/test_gather.py b/src/python/tests/test_gather.py index e56602b3..d0376a02 100644 --- a/src/python/tests/test_gather.py +++ b/src/python/tests/test_gather.py @@ -310,13 +310,15 @@ def test_against_multisigfile(runtmp, zip_against): df = pandas.read_csv(g_output) if zip_against: assert len(df) == 3 + print(df) else: + print(df) assert len(df) == 1 # @CTB this is a bug :(. It should load multiple sketches properly! @pytest.mark.parametrize('zip_against', [False, True]) -def test_query_multisigfile(runtmp, zip_against): +def test_query_multisigfile(runtmp, capfd, zip_against): # test with a sigfile that contains multiple sketches against_list = runtmp.output('against.txt') @@ -335,12 +337,14 @@ def test_query_multisigfile(runtmp, zip_against): g_output = runtmp.output('gather.csv') p_output = runtmp.output('prefetch.csv') - runtmp.sourmash('scripts', 'fastgather', combined, against_list, + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'fastgather', combined, against_list, '-o', g_output, '--output-prefetch', p_output, '-s', '100000') - # @CTB this should fail, not succeed :(. - df = pandas.read_csv(g_output) - assert len(df) == 1 + # this fails now :) + captured = capfd.readouterr() + print(captured.err) + assert "Error: Fastgather requires a single query sketch. 
Check input:" in captured.err @pytest.mark.parametrize('zip_against', [False, True]) @@ -555,7 +559,7 @@ def test_simple_protein(runtmp): # test basic protein execution sigs = get_test_data('protein.zip') - query = runtmp.output('query.sig') + query = runtmp.output('query.zip') against = runtmp.output('against.zip') # extract query from zip file runtmp.sourmash('sig', 'extract', sigs, '--name', 'GCA_001593935', '-o', query) @@ -582,7 +586,7 @@ def test_simple_dayhoff(runtmp): # test basic protein execution sigs = get_test_data('dayhoff.zip') - query = runtmp.output('query.sig') + query = runtmp.output('query.zip') against = runtmp.output('against.zip') # extract query from zip file runtmp.sourmash('sig', 'extract', sigs, '--name', 'GCA_001593935', '-o', query) @@ -609,7 +613,7 @@ def test_simple_hp(runtmp): # test basic protein execution sigs = get_test_data('hp.zip') - query = runtmp.output('query.sig') + query = runtmp.output('query.zip') against = runtmp.output('against.zip') # extract query from zip file runtmp.sourmash('sig', 'extract', sigs, '--name', 'GCA_001593935', '-o', query) diff --git a/src/python/tests/test_multisearch.py b/src/python/tests/test_multisearch.py index ef2ea222..ff2136b0 100644 --- a/src/python/tests/test_multisearch.py +++ b/src/python/tests/test_multisearch.py @@ -148,11 +148,11 @@ def test_missing_query(runtmp, capfd, zip_query): captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory ' in captured.err + assert 'Error: No such file or directory' in captured.err -def test_bad_query(runtmp, capfd): - # test with a bad query (a .sig.gz file) +def test_sig_query(runtmp, capfd): + # sig is ok as query now against_list = runtmp.output('against.txt') sig2 = get_test_data('2.fa.sig.gz') @@ -163,17 +163,17 @@ def test_bad_query(runtmp, capfd): output = runtmp.output('out.csv') - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'multisearch', sig2, against_list, + 
runtmp.sourmash('scripts', 'multisearch', sig2, against_list, '-o', output) captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid line in fromfile ' in captured.err - + assert os.path.exists(output) + df = pandas.read_csv(output) + assert len(df) == 1 -def test_bad_query_2(runtmp, capfd): +def test_bad_query(runtmp, capfd): # test with a bad query list (a missing file) query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -221,7 +221,7 @@ def test_bad_query_3(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err + assert 'InvalidArchive' in captured.err @pytest.mark.parametrize("zip_db", [False, True]) @@ -250,11 +250,11 @@ def test_missing_against(runtmp, capfd, zip_db): captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory ' in captured.err + assert 'Error: No such file or directory' in captured.err -def test_bad_against(runtmp, capfd): - # test with a bad against list (a .sig file in this case) +def test_sig_against(runtmp, capfd): + # against can be sig now query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -267,17 +267,17 @@ def test_bad_against(runtmp, capfd): output = runtmp.output('out.csv') - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'multisearch', query_list, sig2, + runtmp.sourmash('scripts', 'multisearch', query_list, sig2, '-o', output) captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid line in fromfile ' in captured.err - + assert os.path.exists(output) + df = pandas.read_csv(output) + assert len(df) == 1 -def test_bad_against_2(runtmp, capfd): +def test_bad_against(runtmp, capfd): # test with a bad against list (a missing file) query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -380,7 +380,7 @@ def 
test_load_only_one_bug(runtmp, capfd, zip_db): print(captured.err) assert not 'WARNING: skipped 1 paths - no compatible signatures.' in captured.err - assert not 'WARNING: no compatible sketches in path ' in captured.err + assert not 'WARNING: no compatible sketches in path' in captured.err @pytest.mark.parametrize("zip_query", [False, True]) diff --git a/src/python/tests/test_search.py b/src/python/tests/test_search.py index 8af6bf3f..2ab45907 100644 --- a/src/python/tests/test_search.py +++ b/src/python/tests/test_search.py @@ -327,7 +327,7 @@ def test_bad_query_3(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err + assert 'InvalidArchive' in captured.err @pytest.mark.parametrize("indexed", [False, True]) diff --git a/src/utils.rs b/src/utils.rs index bd1161b3..0b69eb00 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -245,102 +245,6 @@ pub fn load_sketchlist_filenames>(sketchlist_filename: &P) -> Res Ok(sketchlist_filenames) } -/// Loads signature file paths from a ZIP archive. -/// -/// This function extracts the contents of a ZIP archive containing -/// signature files (with extensions ".sig" or ".sig.gz") to a temporary directory. -/// It returns the paths of these extracted signature files. -/// -/// # Arguments -/// -/// * `zip_path` - The path to the ZIP archive. -/// -/// # Returns -/// -/// Returns a tuple containing: -/// * A vector of `PathBuf` representing the paths to the extracted signature files. -/// * The `TempDir` representing the temporary directory where the files were extracted. -/// Since tempfile::TempDir creates a temporary directory that is automatically -/// deleted once the TempDir value goes out of scope, we return it here to move it -/// to the main function scope. -/// -/// # Errors -/// -/// Returns an error if: -/// * Unable to create a temporary directory. -/// * Unable to open or read the ZIP archive. 
-/// * Any other IO or file related error. -pub fn load_sigpaths_from_zip>( - zip_path: P, - template: &Sketch, - report_type: ReportType, -) -> Result<(Vec, tempfile::TempDir)> { - let mut signature_paths = Vec::new(); - let temp_dir = tempdir()?; - let zip_file = File::open(&zip_path)?; - let mut zip_archive = ZipArchive::new(zip_file)?; - - let mut skipped_paths = 0; - for i in 0..zip_archive.len() { - let mut file = zip_archive.by_index(i)?; - // make string copy to avoid file borrowing issues - let file_name_str = file.name().to_string(); - let file_name = Path::new(&file_name_str) - .file_name() - .unwrap() - .to_str() - .unwrap(); - // use contains to account for sig.gz_0 bug in sourmash - if file_name.contains(".sig") || file_name.contains(".sig.gz") { - // read file - let mut contents = Vec::new(); - file.read_to_end(&mut contents)?; - // get sig from file - let sigs = Signature::from_reader(&contents[..])?; - if sigs.len() > 1 { - return Err(anyhow::anyhow!( - "File '{}' has more than one signature.", - file_name - )); - } - let sig = &sigs[0]; // Directly take the first (only) signature - // check for compatible sketch - let is_compatible = if let Some(Sketch::MinHash(_)) = sig.select_sketch(template) { - true - } else if let Sketch::MinHash(template_mh) = template { - sig.sketches().iter().any(|sketch| { - matches!(sketch, Sketch::MinHash(ref_mh) if check_compatible_downsample(&ref_mh, template_mh).is_ok()) - }) - } else { - false - }; - - if is_compatible { - let path = temp_dir.path().join(file_name); - // write contents to new file - let mut new_file = File::create(&path)?; - new_file.write_all(&contents)?; - // add filepath to signature paths - signature_paths.push(path); - } else { - skipped_paths += 1; - } - } - } - if skipped_paths > 0 { - eprintln!( - "WARNING: skipped {} {} paths - no compatible signatures.", - skipped_paths, report_type - ); - } - eprintln!( - "loaded paths for {} signature files from zipfile {}", - signature_paths.len(), - 
zip_path.as_ref().display() - ); - Ok((signature_paths, temp_dir)) -} - pub fn load_fasta_fromfile>( sketchlist_filename: &P, ) -> Result> { @@ -467,22 +371,22 @@ pub fn load_sketches_above_threshold( let matchlist: BinaryHeap = against_collection .par_iter() - .filter_map(|(idx, against_record)| { - let mut mm = None; + .filter_map(|(_idx, against_record)| { + let mut results = Vec::new(); // Load against into memory - if let Ok(against_sig) = against_collection.sig_for_dataset(idx) { - if let Some(sketch) = against_sig.sketches().get(0) { + if let Ok(against_sig) = against_collection.sig_from_record(against_record) { + for sketch in against_sig.sketches() { if let Sketch::MinHash(against_mh) = sketch { // currently downsampling here to avoid changing md5sum if let Ok(overlap) = against_mh.count_common(query, true) { if overlap >= threshold_hashes { let result = PrefetchResult { - name: against_sig.name().to_string(), - md5sum: against_mh.md5sum().to_string(), + name: against_record.name().to_string(), + md5sum: against_mh.md5sum(), minhash: against_mh.clone(), overlap, }; - mm = Some(result); + results.push(result); } } } else { @@ -492,12 +396,6 @@ pub fn load_sketches_above_threshold( ); let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } - } else { - eprintln!( - "WARNING: no compatible sketches in path '{}'", - against_sig.filename() - ); - let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } } else { // this shouldn't happen here anymore -- likely would happen at load_collection @@ -507,8 +405,13 @@ pub fn load_sketches_above_threshold( ); let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } - mm + if results.is_empty() { + None + } else { + Some(results) + } }) + .flatten() .collect(); let skipped_paths = skipped_paths.load(atomic::Ordering::SeqCst); @@ -580,50 +483,6 @@ pub fn load_sketches_from_zip>( Ok((sketchlist, skipped_paths, failed_paths)) } -/// Control function to read signature FILE PATHS from an input file. 
-/// If a ZIP archive is provided (detected via extension), -/// use `load_sigpaths_from_zip`. Otherwise, assume the -/// user provided a `fromfile` sketchlist and use -/// `load_sketchlist_filenames`. -/// -/// # Arguments -/// -/// * `sketchlist_path` - Path to either a ZIP archive or a list of signature file paths. -/// -/// # Returns -/// -/// Returns a tuple containing: -/// * A vector of `PathBuf` representing the signature file paths. -/// * If extracting from a zipfile, signature files will be extracted to a -/// `TempDir` temporary directory where they can be used individually. -pub fn load_sigpaths_from_zip_or_pathlist>( - sketchlist_path: P, - template: &Sketch, - report_type: ReportType, -) -> Result<(Vec, Option)> { - eprintln!( - "Reading list of filepaths from: '{}'", - sketchlist_path.as_ref().display() - ); - - let result = if sketchlist_path - .as_ref() - .extension() - .map(|ext| ext == "zip") - .unwrap_or(false) - { - let (paths, tempdir) = load_sigpaths_from_zip(&sketchlist_path, template, report_type)?; - (paths, Some(tempdir)) - } else { - let paths = load_sketchlist_filenames(&sketchlist_path)?; - (paths, None) - }; - - eprintln!("Found {} filepaths", result.0.len()); - // should we bail here if empty? 
- Ok(result) -} - pub enum ReportType { Query, Against, @@ -755,20 +614,8 @@ pub fn load_collection( }; let n_total = collection.len(); - eprintln!("n_total: {}", n_total); - // collection = collection.select(selection)?; let selected = collection.select(selection)?; - - if selected.len() == 1 { - let sig = selected.sig_for_dataset(0).unwrap(); - eprintln!("sig name: {:?}", sig.name()); - let mh = sig.minhash().unwrap(); - eprintln!("scaled= {:?}", mh.scaled()) - } - - eprintln!("selection_len: {}", selected.len()); let n_skipped = n_total - selected.len(); - // let n_skipped = n_total - collection.len(); report_on_collection_loading(&selected, n_skipped, n_failed, report_type)?; Ok(selected) } @@ -800,27 +647,6 @@ pub fn report_on_collection_loading( Ok(()) } -// pub fn load_single_sketch_from_sig<'a>(sig: &'a SigStore, selection: &'a Selection) -> Result<&'a KmerMinHash> { -// let sketch = sig.sketches().get(0).ok_or_else(|| { -// anyhow::anyhow!("No sketch found with scaled={}, k={}", selection.scaled().unwrap_or_default(), selection.ksize().unwrap_or_default()) -// })?; - -// if let Sketch::MinHash(mh) = sketch { -// Ok(mh) -// } else { -// Err(anyhow::anyhow!("No sketch found with scaled={}, k={}", selection.scaled().unwrap_or_default(), selection.ksize().unwrap_or_default())) -// } -// } - -// pub fn load_single_sig_and_sketch<'a>( -// query_collection: &'a Collection, -// selection: &'a Selection, -// ) -> Result<(SigStore, &'a KmerMinHash)> { -// let sig = load_single_sig_from_collection(query_collection, selection)?; -// let sketch = load_single_sketch_from_sig(&sig, selection)?; -// Ok((sig, sketch)) -// } - /// Uses the output of sketch loading functions to report the /// total number of sketches loaded, as well as the number of files, /// if any, that failed to load or contained no compatible sketches. 
@@ -988,20 +814,19 @@ pub fn build_template(ksize: u8, scaled: usize, moltype: &str) -> Sketch { } pub fn build_selection(ksize: u8, scaled: usize, moltype: &str) -> Selection { - // let hash_function = match moltype { - // "dna" => HashFunctions::Murmur64Dna, - // "protein" => HashFunctions::Murmur64Protein, - // "dayhoff" => HashFunctions::Murmur64Dayhoff, - // "hp" => HashFunctions::Murmur64Hp, - // _ => panic!("Unknown molecule type: {}", moltype), - // }; - let hash_function = HashFunctions::try_from(moltype) - .map_err(|_| panic!("Unknown molecule type: {}", moltype)) - .unwrap(); - let adjusted_ksize = if moltype == "dna" { ksize } else { ksize * 3 }; + let hash_function = match moltype { + "dna" => HashFunctions::Murmur64Dna, + "protein" => HashFunctions::Murmur64Protein, + "dayhoff" => HashFunctions::Murmur64Dayhoff, + "hp" => HashFunctions::Murmur64Hp, + _ => panic!("Unknown molecule type: {}", moltype), + }; + // let hash_function = HashFunctions::try_from(moltype) + // .map_err(|_| panic!("Unknown molecule type: {}", moltype)) + // .unwrap(); Selection::builder() - .ksize(adjusted_ksize.into()) + .ksize(ksize.into()) .scaled(scaled as u32) .moltype(hash_function) .build() From f5216f820b955c62a685e60c56611601f84afbf8 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Tue, 30 Jan 2024 18:38:55 -0800 Subject: [PATCH 27/47] update pairwise --- src/lib.rs | 5 +- src/pairwise.rs | 93 ++++++++++++++++++++----------- src/python/tests/test_pairwise.py | 24 +++----- 3 files changed, 70 insertions(+), 52 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index acdc7b61..c7c2d69f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -238,8 +238,9 @@ fn do_pairwise( moltype: String, output_path: Option, ) -> anyhow::Result { - let template = build_template(ksize, scaled, &moltype); - match pairwise::pairwise(siglist_path, threshold, template, output_path) { + let queryfile_path: camino::Utf8PathBuf = siglist_path.into(); + let selection = build_selection(ksize, scaled, &moltype); + match pairwise::pairwise(&queryfile_path, threshold, &selection, output_path) { Ok(_) => Ok(0), Err(e) => { eprintln!("Error: {e}"); diff --git a/src/pairwise.rs b/src/pairwise.rs index 6e7fe7c4..c4c0a886 100644 --- a/src/pairwise.rs +++ b/src/pairwise.rs @@ -1,6 +1,7 @@ use anyhow::Result; /// pairwise: massively parallel in-memory pairwise comparisons. use rayon::prelude::*; +use sourmash::sketch::minhash::KmerMinHash; use std::fs::File; use std::io::{BufWriter, Write}; @@ -12,20 +13,41 @@ use std::sync::atomic::AtomicUsize; use sourmash::signature::SigsTrait; use sourmash::sketch::Sketch; -use crate::utils::{load_sketches_from_zip_or_pathlist, ReportType}; +use crate::utils::{load_collection, ReportType}; +use sourmash::prelude::Select; +use sourmash::selection::Selection; +use sourmash::storage::SigStore; /// Perform pairwise comparisons of all signatures in a list. /// /// Note: this function loads all _signatures_ into memory. pub fn pairwise>( - siglist: P, + sigpath: &camino::Utf8PathBuf, threshold: f64, - template: Sketch, + selection: &Selection, output: Option

, ) -> Result<(), Box> { // Load all sigs into memory at once. - let sigs = load_sketches_from_zip_or_pathlist(&siglist, &template, ReportType::Query)?; + let collection = load_collection(sigpath, selection, ReportType::Query)?; + + if collection.len() <= 1 { + bail!( + "Pairwise requires two or more sketches. Check input: '{:?}'", + &sigpath + ) + } + + let mut sketches: Vec<(KmerMinHash, String, String)> = Vec::new(); + for (_idx, record) in collection.iter() { + if let Ok(sig) = collection.sig_from_record(record) { + if let Some(ds_mh) = sig.clone().select(&selection)?.minhash().cloned() { + sketches.push((ds_mh, record.name().to_string(), record.md5().to_string())); + } + } else { + eprintln!("Failed to load record: {}", record.name()); + } + } // set up a multi-producer, single-consumer channel. let (send, recv) = std::sync::mpsc::sync_channel(rayon::current_num_threads()); @@ -54,37 +76,40 @@ pub fn pairwise>( let processed_cmp = AtomicUsize::new(0); - sigs.par_iter().enumerate().for_each(|(i, q1)| { - for q2 in &sigs[(i + 1)..] 
{ - let overlap = q1.minhash.count_common(&q2.minhash, false).unwrap() as f64; - let query1_size = q1.minhash.size() as f64; - let query2_size = q2.minhash.size() as f64; - - let containment_q1_in_q2 = overlap / query1_size; - let containment_q2_in_q1 = overlap / query2_size; - let max_containment = containment_q1_in_q2.max(containment_q2_in_q1); - let jaccard = overlap / (query1_size + query2_size - overlap); - - if containment_q1_in_q2 > threshold || containment_q2_in_q1 > threshold { - send.send(( - q1.name.clone(), - q1.md5sum.clone(), - q2.name.clone(), - q2.md5sum.clone(), - containment_q1_in_q2, - max_containment, - jaccard, - overlap, - )) - .unwrap(); - } - - let i = processed_cmp.fetch_add(1, atomic::Ordering::SeqCst); - if i % 100000 == 0 { - eprintln!("Processed {} comparisons", i); + sketches + .par_iter() + .enumerate() + .for_each(|(idx, (q1, q1_name, q1_md5))| { + for (j, (q2, q2_name, q2_md5)) in sketches.iter().enumerate().skip(idx + 1) { + let overlap = q1.count_common(q2, false).unwrap() as f64; + let query1_size = q1.size() as f64; + let query2_size = q2.size() as f64; + + let containment_q1_in_q2 = overlap / query1_size; + let containment_q2_in_q1 = overlap / query2_size; + let max_containment = containment_q1_in_q2.max(containment_q2_in_q1); + let jaccard = overlap / (query1_size + query2_size - overlap); + + if containment_q1_in_q2 > threshold || containment_q2_in_q1 > threshold { + send.send(( + q1_name.clone(), + q1_md5.clone(), + q2_name.clone(), + q2_md5.clone(), + containment_q1_in_q2, + max_containment, + jaccard, + overlap, + )) + .unwrap(); + } + + let i = processed_cmp.fetch_add(1, atomic::Ordering::SeqCst); + if i % 100000 == 0 { + eprintln!("Processed {} comparisons", i); + } } - } - }); + }); // do some cleanup and error handling - drop(send); // close the channel diff --git a/src/python/tests/test_pairwise.py b/src/python/tests/test_pairwise.py index 84bb2365..eeec42d4 100644 --- a/src/python/tests/test_pairwise.py +++ 
b/src/python/tests/test_pairwise.py @@ -115,15 +115,9 @@ def test_simple_threshold(runtmp, zip_query): -def test_bad_query(runtmp, capfd): - # test with a bad query (a .sig.gz file) - against_list = runtmp.output('against.txt') - +def test_sig_query(runtmp, capfd): + # sig query is ok now, but fails bc only one sig sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') - - make_file_list(against_list, [sig2, sig47, sig63]) output = runtmp.output('out.csv') @@ -133,18 +127,16 @@ def test_bad_query(runtmp, capfd): captured = capfd.readouterr() print(captured.err) + assert "Error: Pairwise requires two or more sketches. Check input" in captured.err - assert 'Error: invalid line in fromfile ' in captured.err - -def test_bad_query_2(runtmp, capfd): +def test_bad_query(runtmp, capfd): # test with a bad query list (a missing file) query_list = runtmp.output('query.txt') sig2 = get_test_data('2.fa.sig.gz') sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') - make_file_list(query_list, [sig2, "no-exist"]) + make_file_list(query_list, [sig2, sig47, "no-exist"]) output = runtmp.output('out.csv') @@ -160,7 +152,7 @@ def test_bad_query_2(runtmp, capfd): -def test_bad_query_3(runtmp, capfd): +def test_bad_query_2(runtmp, capfd): # test with a bad query (a .sig.gz file renamed as zip file) sig2 = get_test_data('2.fa.sig.gz') @@ -182,7 +174,7 @@ def test_bad_query_3(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err + assert 'InvalidArchive' in captured.err @pytest.mark.parametrize("zip_db", [False, True]) @@ -203,7 +195,7 @@ def test_missing_query(runtmp, capfd, zip_db): captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory ' in captured.err + assert 'Error: No such file or directory' in captured.err From 893e0a7b52119d351b360df277979c2ec03d460d Mon 
Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 30 Jan 2024 18:44:22 -0800 Subject: [PATCH 28/47] clean up a little --- src/lib.rs | 4 +- src/python/tests/test_pairwise.py | 2 - src/utils.rs | 106 ------------------------------ 3 files changed, 1 insertion(+), 111 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index c7c2d69f..d2365afe 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,8 +5,8 @@ use pyo3::prelude::*; extern crate simple_error; mod utils; +use crate::utils::build_selection; use crate::utils::is_revindex_database; -use crate::utils::{build_selection, build_template}; mod check; mod fastgather; mod fastmultigather; @@ -212,8 +212,6 @@ fn do_multisearch( let queryfile_path: camino::Utf8PathBuf = querylist_path.into(); let againstfile_path: camino::Utf8PathBuf = siglist_path.into(); let selection = build_selection(ksize, scaled, &moltype); - // let selection = build_selection(ksize, scaled, &moltype); - let template = build_template(ksize, scaled, &moltype); match multisearch::multisearch( &queryfile_path, &againstfile_path, diff --git a/src/python/tests/test_pairwise.py b/src/python/tests/test_pairwise.py index eeec42d4..55259e85 100644 --- a/src/python/tests/test_pairwise.py +++ b/src/python/tests/test_pairwise.py @@ -150,8 +150,6 @@ def test_bad_query(runtmp, capfd): assert "WARNING: 1 query paths failed to load. See error messages above." in captured.err - - def test_bad_query_2(runtmp, capfd): # test with a bad query (a .sig.gz file renamed as zip file) diff --git a/src/utils.rs b/src/utils.rs index 0b69eb00..eeff3ff9 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -420,69 +420,6 @@ pub fn load_sketches_above_threshold( Ok((matchlist, skipped_paths, failed_paths)) } -/// Loads all compatible sketches from a ZIP archive at the given path into memory. -/// Currently not parallelized; use a different zip crate to enable parallelization. -/// -/// # Arguments -/// -/// * `zip_path` - Path to the ZIP archive. 
-/// * `template` - Reference to the Sketch template. -/// -/// # Returns -/// -/// Returns a tuple containing: -/// * A vector of `SmallSignature`s. -/// * Number of paths that were skipped because they did not match the sketch parameters. -/// * Number of paths that failed to load. -/// -/// # Errors -/// -/// Returns an error if: -/// * Unable to open the ZIP file. -/// * ZIP archive is malformed. -pub fn load_sketches_from_zip>( - zip_path: P, - template: &Sketch, -) -> Result<(Vec, usize, usize)> { - let mut sketchlist = Vec::new(); - let zip_file = File::open(&zip_path)?; - let mut zip_archive = ZipArchive::new(zip_file)?; - let mut skipped_paths = 0; - let mut failed_paths = 0; - - // loop through, loading signatures - for i in 0..zip_archive.len() { - let mut file = zip_archive.by_index(i)?; - let file_name = Path::new(file.name()) - .file_name() - .unwrap() - .to_str() - .unwrap() - .to_owned(); - - if !file_name.contains(".sig") && !file_name.contains(".sig.gz") { - continue; - } - if let Ok(sigs) = Signature::from_reader(&mut file) { - if let Some(sm) = - prepare_query(&sigs, template, &zip_path.as_ref().display().to_string()) - { - sketchlist.push(sm); - } else { - // track number of paths that have no matching sigs - skipped_paths += 1; - } - } else { - // failed to load from this path - print error & track. - eprintln!("WARNING: could not load sketches from path '{}'", file_name); - failed_paths += 1; - } - } - drop(zip_archive); - println!("loaded {} signatures", sketchlist.len()); - Ok((sketchlist, skipped_paths, failed_paths)) -} - pub enum ReportType { Query, Against, @@ -500,49 +437,6 @@ impl std::fmt::Display for ReportType { } } -/// Control function to load compatible signatures from an input file. -/// If a ZIP archive is provided (detected via extension), -/// calls `load_sketches_from_zip`. Otherwise, assumes the -/// user provided a `fromfile` sketchlist and calls -/// `load_sketchlist_filenames`. 
-/// -/// # Arguments -/// -/// * `sketchlist_path` - Path to either a ZIP archive or a list of signature file paths. -/// * `template` - Reference to the Sketch template (used to load only compatible signatures). -/// * `report_type` - ReportType Enum. Are these 'query' or 'search' signatures? -/// -/// # Returns -/// -/// Returns a vector of `SmallSignature`s. -pub fn load_sketches_from_zip_or_pathlist>( - sketchlist_path: P, - template: &Sketch, - report_type: ReportType, -) -> Result> { - eprintln!( - "Reading list of {} paths from: '{}'", - report_type, - sketchlist_path.as_ref().display() - ); - - let (sketchlist, skipped_paths, failed_paths) = if sketchlist_path - .as_ref() - .extension() - .map(|ext| ext == "zip") - .unwrap_or(false) - { - load_sketches_from_zip(sketchlist_path, template)? - } else { - let sketch_paths = load_sketchlist_filenames(&sketchlist_path)?; - load_sketches(sketch_paths, template)? - }; - - report_on_sketch_loading(&sketchlist, skipped_paths, failed_paths, report_type)?; - - Ok(sketchlist) -} - pub fn load_collection( sigpath: &camino::Utf8PathBuf, selection: &Selection, From dbdff4a88047f4d267dd097f77fed31c204c9c21 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Thu, 1 Feb 2024 15:18:05 -0800 Subject: [PATCH 29/47] clean up; unify sketch loading for pairwise/multisearch --- src/fastgather.rs | 1 - src/fastmultigather.rs | 5 +- src/mastiff_manygather.rs | 1 - src/multisearch.rs | 108 +++++---------- src/pairwise.rs | 20 +-- src/python/tests/test_pairwise.py | 4 +- src/utils.rs | 217 +++++------------------------- 7 files changed, 73 insertions(+), 283 deletions(-) diff --git a/src/fastgather.rs b/src/fastgather.rs index 82362c85..ff4a07ea 100644 --- a/src/fastgather.rs +++ b/src/fastgather.rs @@ -5,7 +5,6 @@ use sourmash::selection::Selection; // use camino; use sourmash::prelude::Select; -use sourmash::signature::SigsTrait; use crate::utils::{ consume_query_by_gather, load_collection, load_sketches_above_threshold, write_prefetch, diff --git a/src/fastmultigather.rs b/src/fastmultigather.rs index d7537c8a..f28dcb85 100644 --- a/src/fastmultigather.rs +++ b/src/fastmultigather.rs @@ -2,19 +2,16 @@ use anyhow::Result; use rayon::prelude::*; -use serde::Serialize; -use sourmash::prelude::Select; use sourmash::selection::Selection; use sourmash::sketch::Sketch; use sourmash::storage::SigStore; -use sourmash::{selection, signature::Signature}; use std::sync::atomic; use std::sync::atomic::AtomicUsize; use std::collections::BinaryHeap; -use camino::{Utf8Path, Utf8PathBuf}; +use camino::Utf8Path; use crate::utils::{ consume_query_by_gather, load_collection, write_prefetch, PrefetchResult, ReportType, diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index 2175e759..6a80a647 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -2,7 +2,6 @@ use anyhow::Result; use rayon::prelude::*; -use sourmash::signature::Signature; use sourmash::sketch::Sketch; use std::path::Path; diff --git a/src/multisearch.rs b/src/multisearch.rs index be9989f6..0d772276 100644 --- a/src/multisearch.rs +++ b/src/multisearch.rs @@ -4,18 +4,14 @@ use rayon::prelude::*; use std::fs::File; use 
std::io::{BufWriter, Write}; -use std::path::Path; use std::sync::atomic; use std::sync::atomic::AtomicUsize; -use sourmash::prelude::Select; use sourmash::selection::Selection; use sourmash::signature::SigsTrait; -use sourmash::sketch::Sketch; -use sourmash::storage::SigStore; -use crate::utils::{load_collection, ReportType}; +use crate::utils::{load_collection, load_mh_with_name_and_md5, ReportType}; /// Search many queries against a list of signatures. /// @@ -31,37 +27,14 @@ pub fn multisearch( ) -> Result<(), Box> { // Load all queries into memory at once. - // let queries = load_sketches_from_zip_or_pathlist(&querylist, &template, ReportType::Query)?; let query_collection = load_collection(query_filepath, selection, ReportType::Query)?; - let mut queries: Vec = vec![]; - for (idx, record) in query_collection.iter() { - if let Ok(sig) = query_collection.sig_from_record(record) - // .unwrap() - // .select(&selection) // if we select here, we downsample and the md5sum changes! - // ...which means we would lose the original md5sum that is used in the standard gather results. - { - queries.push(sig); - } else { - eprintln!("Failed to load 'against' record: {}", record.name()); - } - } + let queries = + load_mh_with_name_and_md5(query_collection, &selection, ReportType::Query).unwrap(); // Load all against sketches into memory at once. - // let against = load_sketches_from_zip_or_pathlist(&againstlist, &template, ReportType::Against)?; let against_collection = load_collection(against_filepath, selection, ReportType::Against)?; - let mut against: Vec = vec![]; - - for (idx, record) in against_collection.iter() { - if let Ok(sig) = against_collection.sig_from_record(record) - // .unwrap() - // .select(&selection) // if we select here, we downsample and the md5sum changes! - // ...which means we would lose the original md5sum that is used in the standard gather results. 
- { - against.push(sig); - } else { - eprintln!("Failed to load 'against' record: {}", record.name()); - } - } + let against = + load_mh_with_name_and_md5(against_collection, &selection, ReportType::Against).unwrap(); // set up a multi-producer, single-consumer channel. let (send, recv) = std::sync::mpsc::sync_channel(rayon::current_num_threads()); @@ -94,49 +67,42 @@ pub fn multisearch( let send = against .par_iter() - .filter_map(|target| { + .filter_map(|(against_mh, against_name, against_md5)| { let mut results = vec![]; - - let ds_against_sig = target.clone().select(&selection).unwrap(); - if let Some(against_mh) = ds_against_sig.minhash() { - // search for matches & save containment. - for query_sig in queries.iter() { - let i = processed_cmp.fetch_add(1, atomic::Ordering::SeqCst); - if i % 100000 == 0 { - eprintln!("Processed {} comparisons", i); - } - let ds_q = query_sig.clone().select(&selection).unwrap(); - let query_mh = ds_q.minhash()?; - let overlap = query_mh.count_common(&against_mh, false).unwrap() as f64; - // use downsampled sizes - let query_size = query_mh.size() as f64; - let target_size = against_mh.size() as f64; - - let containment_query_in_target = overlap / query_size; - let containment_in_target = overlap / target_size; - let max_containment = containment_query_in_target.max(containment_in_target); - let jaccard = overlap / (target_size + query_size - overlap); - - if containment_query_in_target > threshold { - results.push(( - query_sig.name(), - query_sig.md5sum(), - target.name(), - target.md5sum(), - containment_query_in_target, - max_containment, - jaccard, - overlap, - )) - } + // search for matches & save containment. 
+ for (query_mh, query_name, query_md5) in queries.iter() { + let i = processed_cmp.fetch_add(1, atomic::Ordering::SeqCst); + if i % 100000 == 0 { + eprintln!("Processed {} comparisons", i); } - if results.is_empty() { - None - } else { - Some(results) + + let overlap = query_mh.count_common(&against_mh, false).unwrap() as f64; + // use downsampled sizes + let query_size = query_mh.size() as f64; + let target_size = against_mh.size() as f64; + + let containment_query_in_target = overlap / query_size; + let containment_in_target = overlap / target_size; + let max_containment = containment_query_in_target.max(containment_in_target); + let jaccard = overlap / (target_size + query_size - overlap); + + if containment_query_in_target > threshold { + results.push(( + query_name.clone(), + query_md5.clone(), + against_name.clone(), + against_md5.clone(), + containment_query_in_target, + max_containment, + jaccard, + overlap, + )) } - } else { + } + if results.is_empty() { None + } else { + Some(results) } }) .flatten() diff --git a/src/pairwise.rs b/src/pairwise.rs index c4c0a886..b6713d41 100644 --- a/src/pairwise.rs +++ b/src/pairwise.rs @@ -1,7 +1,6 @@ use anyhow::Result; /// pairwise: massively parallel in-memory pairwise comparisons. use rayon::prelude::*; -use sourmash::sketch::minhash::KmerMinHash; use std::fs::File; use std::io::{BufWriter, Write}; @@ -11,12 +10,9 @@ use std::sync::atomic; use std::sync::atomic::AtomicUsize; use sourmash::signature::SigsTrait; -use sourmash::sketch::Sketch; -use crate::utils::{load_collection, ReportType}; -use sourmash::prelude::Select; +use crate::utils::{load_collection, load_mh_with_name_and_md5, ReportType}; use sourmash::selection::Selection; -use sourmash::storage::SigStore; /// Perform pairwise comparisons of all signatures in a list. /// @@ -29,7 +25,7 @@ pub fn pairwise>( output: Option

, ) -> Result<(), Box> { // Load all sigs into memory at once. - let collection = load_collection(sigpath, selection, ReportType::Query)?; + let collection = load_collection(sigpath, selection, ReportType::Pairwise)?; if collection.len() <= 1 { bail!( @@ -37,17 +33,7 @@ pub fn pairwise>( &sigpath ) } - - let mut sketches: Vec<(KmerMinHash, String, String)> = Vec::new(); - for (_idx, record) in collection.iter() { - if let Ok(sig) = collection.sig_from_record(record) { - if let Some(ds_mh) = sig.clone().select(&selection)?.minhash().cloned() { - sketches.push((ds_mh, record.name().to_string(), record.md5().to_string())); - } - } else { - eprintln!("Failed to load record: {}", record.name()); - } - } + let sketches = load_mh_with_name_and_md5(collection, &selection, ReportType::Pairwise).unwrap(); // set up a multi-producer, single-consumer channel. let (send, recv) = std::sync::mpsc::sync_channel(rayon::current_num_threads()); diff --git a/src/python/tests/test_pairwise.py b/src/python/tests/test_pairwise.py index 55259e85..0dd67c05 100644 --- a/src/python/tests/test_pairwise.py +++ b/src/python/tests/test_pairwise.py @@ -147,7 +147,7 @@ def test_bad_query(runtmp, capfd): print(captured.err) assert "WARNING: could not load sketches from path 'no-exist'" in captured.err - assert "WARNING: 1 query paths failed to load. See error messages above." in captured.err + assert "WARNING: 1 signature paths failed to load. See error messages above." 
in captured.err def test_bad_query_2(runtmp, capfd): @@ -241,7 +241,7 @@ def test_nomatch_query(runtmp, capfd, zip_query): captured = capfd.readouterr() print(captured.err) - assert 'WARNING: skipped 1 query paths - no compatible signatures' in captured.err + assert 'WARNING: skipped 1 signature paths - no compatible signatures' in captured.err @pytest.mark.parametrize("zip_db", [False, True]) diff --git a/src/utils.rs b/src/utils.rs index eeff3ff9..5cd49de1 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -5,40 +5,26 @@ use sourmash::manifest::Manifest; use sourmash::selection::Select; use std::fs::{create_dir_all, File}; -use std::io::Read; use std::io::{BufRead, BufReader, BufWriter, Write}; use std::panic; use std::path::{Path, PathBuf}; -use tempfile::tempdir; -use zip::read::ZipArchive; - use std::sync::atomic; use std::sync::atomic::AtomicUsize; use std::collections::BinaryHeap; -use anyhow::{anyhow, Context, Result}; +use anyhow::{anyhow, Result}; use std::cmp::{Ordering, PartialOrd}; -use sourmash::collection::{self, Collection}; -use sourmash::errors::SourmashError; +use sourmash::collection::Collection; use sourmash::manifest::Record; use sourmash::selection::Selection; use sourmash::signature::{Signature, SigsTrait}; -use sourmash::sketch::minhash::{max_hash_for_scaled, KmerMinHash}; +use sourmash::sketch::minhash::KmerMinHash; use sourmash::sketch::Sketch; use sourmash::storage::{FSStorage, InnerStorage, SigStore}; -/// Track a name/minhash. - -pub struct SmallSignature { - pub location: String, - pub name: String, - pub md5sum: String, - pub minhash: KmerMinHash, -} - /// Structure to hold overlap information from comparisons. pub struct PrefetchResult { @@ -68,86 +54,6 @@ impl PartialEq for PrefetchResult { impl Eq for PrefetchResult {} -/// check to see if two KmerMinHash are compatible. -/// -/// CTB note: despite the name, downsampling is not performed? -/// Although it checks if they are compatible in one direction... 
- -pub fn check_compatible_downsample( - me: &KmerMinHash, - other: &KmerMinHash, -) -> Result<(), sourmash::Error> { - /* // ignore num minhashes. - if self.num != other.num { - return Err(Error::MismatchNum { - n1: self.num, - n2: other.num, - } - .into()); - } - */ - use sourmash::Error; - - if me.ksize() != other.ksize() { - return Err(Error::MismatchKSizes); - } - if me.hash_function() != other.hash_function() { - // TODO: fix this error - return Err(Error::MismatchDNAProt); - } - if me.max_hash() < other.max_hash() { - return Err(Error::MismatchScaled); - } - if me.seed() != other.seed() { - return Err(Error::MismatchSeed); - } - Ok(()) -} - -/// Given a vec of search Signatures, each containing one or more sketches, -/// and a template Sketch, return a compatible (& now downsampled) -/// Sketch from the search Signatures.. -/// -/// CTB note: this will return the first acceptable match, I think, ignoring -/// all others. - -pub fn prepare_query( - search_sigs: &[Signature], - template: &Sketch, - location: &str, -) -> Option { - for search_sig in search_sigs.iter() { - // find exact match for template? 
- if let Some(Sketch::MinHash(mh)) = search_sig.select_sketch(template) { - return Some(SmallSignature { - location: location.to_string().clone(), - name: search_sig.name(), - md5sum: mh.md5sum(), - minhash: mh.clone(), - }); - } else { - // no - try to find one that can be downsampled - if let Sketch::MinHash(template_mh) = template { - for sketch in search_sig.sketches() { - if let Sketch::MinHash(ref_mh) = sketch { - if check_compatible_downsample(&ref_mh, template_mh).is_ok() { - let max_hash = max_hash_for_scaled(template_mh.scaled()); - let mh = ref_mh.downsample_max_hash(max_hash).unwrap(); - return Some(SmallSignature { - location: location.to_string().clone(), - name: search_sig.name(), - md5sum: ref_mh.md5sum(), // original - minhash: mh, // downsampled - }); - } - } - } - } - } - } - None -} - /// Find sketches in 'sketchlist' that overlap with 'query' above /// specified threshold. @@ -319,42 +225,26 @@ pub fn load_fasta_fromfile>( Ok(results) } -/// Load a collection of sketches from a file in parallel. -pub fn load_sketches( - sketchlist_paths: Vec, - template: &Sketch, -) -> Result<(Vec, usize, usize)> { - let skipped_paths = AtomicUsize::new(0); - let failed_paths = AtomicUsize::new(0); - - let sketchlist: Vec = sketchlist_paths - .par_iter() - .filter_map(|m| { - let filename = m.display().to_string(); - - match Signature::from_path(m) { - Ok(sigs) => { - let sm = prepare_query(&sigs, template, &filename); - if sm.is_none() { - // track number of paths that have no matching sigs - let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); - } - sm - } - Err(err) => { - // failed to load from this path - print error & track. 
- eprintln!("Sketch loading error: {}", err); - eprintln!("WARNING: could not load sketches from path '{}'", filename); - let _i = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); - None - } +pub fn load_mh_with_name_and_md5( + collection: Collection, + selection: &Selection, + report_type: ReportType, +) -> Result> { + let mut sketchinfo: Vec<(KmerMinHash, String, String)> = Vec::new(); + for (_idx, record) in collection.iter() { + if let Ok(sig) = collection.sig_from_record(record) { + if let Some(ds_mh) = sig.clone().select(&selection)?.minhash().cloned() { + sketchinfo.push((ds_mh, record.name().to_string(), record.md5().to_string())); } - }) - .collect(); - - let skipped_paths = skipped_paths.load(atomic::Ordering::SeqCst); - let failed_paths = failed_paths.load(atomic::Ordering::SeqCst); - Ok((sketchlist, skipped_paths, failed_paths)) + } else { + bail!( + "Error: Failed to load {} record: {}", + report_type, + record.name() + ); + } + } + Ok(sketchinfo) } /// Load a collection of sketches from a file, filtering to keep only @@ -423,7 +313,7 @@ pub fn load_sketches_above_threshold( pub enum ReportType { Query, Against, - Index, + Pairwise, } impl std::fmt::Display for ReportType { @@ -431,7 +321,7 @@ impl std::fmt::Display for ReportType { let description = match self { ReportType::Query => "query", ReportType::Against => "search", - ReportType::Index => "index", + ReportType::Pairwise => "signature", }; write!(f, "{}", description) } @@ -447,7 +337,7 @@ pub fn load_collection( } let mut n_failed = 0; - let mut collection = if sigpath.extension().map_or(false, |ext| ext == "zip") { + let collection = if sigpath.extension().map_or(false, |ext| ext == "zip") { match Collection::from_zipfile(&sigpath) { Ok(collection) => collection, Err(_) => bail!("failed to load {} zipfile: '{}'", report_type, sigpath), @@ -514,34 +404,7 @@ pub fn load_collection( Ok(selected) } -pub fn report_on_collection_loading( - collection: &Collection, - skipped_paths: usize, - 
failed_paths: usize, - report_type: ReportType, -) -> Result<()> { - if failed_paths > 0 { - eprintln!( - "WARNING: {} {} paths failed to load. See error messages above.", - failed_paths, report_type - ); - } - if skipped_paths > 0 { - eprintln!( - "WARNING: skipped {} {} paths - no compatible signatures.", - skipped_paths, report_type - ); - } - - // Validate sketches - if collection.is_empty() { - bail!("No {} signatures loaded, exiting.", report_type); - } - eprintln!("Loaded {} {} signature(s)", collection.len(), report_type); - Ok(()) -} - -/// Uses the output of sketch loading functions to report the +/// Uses the output of collection loading function to report the /// total number of sketches loaded, as well as the number of files, /// if any, that failed to load or contained no compatible sketches. /// If no sketches were loaded, bail. @@ -563,8 +426,8 @@ pub fn report_on_collection_loading( /// /// Returns an error if: /// * No signatures were successfully loaded. -pub fn report_on_sketch_loading( - sketchlist: &[SmallSignature], +pub fn report_on_collection_loading( + collection: &Collection, skipped_paths: usize, failed_paths: usize, report_type: ReportType, @@ -583,10 +446,10 @@ pub fn report_on_sketch_loading( } // Validate sketches - eprintln!("Loaded {} {} signature(s)", sketchlist.len(), report_type); - if sketchlist.is_empty() { + if collection.is_empty() { bail!("No {} signatures loaded, exiting.", report_type); } + eprintln!("Loaded {} {} signature(s)", collection.len(), report_type); Ok(()) } @@ -687,26 +550,6 @@ pub fn consume_query_by_gather( Ok(()) } -pub fn build_template(ksize: u8, scaled: usize, moltype: &str) -> Sketch { - let hash_function = match moltype { - "dna" => HashFunctions::Murmur64Dna, - "protein" => HashFunctions::Murmur64Protein, - "dayhoff" => HashFunctions::Murmur64Dayhoff, - "hp" => HashFunctions::Murmur64Hp, - _ => panic!("Unknown molecule type: {}", moltype), - }; - //adjust ksize if not dna - let adjusted_ksize = if 
moltype == "dna" { ksize } else { ksize * 3 }; - let max_hash = max_hash_for_scaled(scaled as u64); - let template_mh = KmerMinHash::builder() - .num(0u32) - .ksize(adjusted_ksize as u32) - .max_hash(max_hash) - .hash_function(hash_function) - .build(); - Sketch::MinHash(template_mh) -} - pub fn build_selection(ksize: u8, scaled: usize, moltype: &str) -> Selection { let hash_function = match moltype { "dna" => HashFunctions::Murmur64Dna, From ab339ba1b677465bdb9954386309ff762d028d5c Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 1 Feb 2024 16:24:10 -0800 Subject: [PATCH 30/47] ...cleaner --- src/fastgather.rs | 19 +++++-- src/fastmultigather.rs | 19 +++++-- src/index.rs | 47 ++++----------- src/lib.rs | 69 +++++++++++++--------- src/manysearch.rs | 26 ++++++--- src/manysketch.rs | 12 ++-- src/mastiff_manygather.rs | 14 +++-- src/mastiff_manysearch.rs | 20 ++++--- src/multisearch.rs | 19 +++++-- src/pairwise.rs | 14 +++-- src/python/tests/test_index.py | 25 ++++---- src/utils.rs | 101 ++++++++++++++++++--------------- 12 files changed, 220 insertions(+), 165 deletions(-) diff --git a/src/fastgather.rs b/src/fastgather.rs index ff4a07ea..280afd54 100644 --- a/src/fastgather.rs +++ b/src/fastgather.rs @@ -12,16 +12,22 @@ use crate::utils::{ }; pub fn fastgather( - query_filepath: &camino::Utf8PathBuf, - against_filepath: &camino::Utf8PathBuf, + query_filepath: String, + against_filepath: String, threshold_bp: usize, ksize: u8, scaled: usize, selection: &Selection, gather_output: Option, prefetch_output: Option, + allow_failed_sigpaths: bool, ) -> Result<()> { - let query_collection = load_collection(query_filepath, selection, ReportType::Query)?; + let query_collection = load_collection( + &query_filepath, + selection, + ReportType::Query, + allow_failed_sigpaths, + )?; if query_collection.len() != 1 { bail!( @@ -40,7 +46,12 @@ pub fn fastgather( } }; // build the list of paths to match against. 
- let against_collection = load_collection(against_filepath, selection, ReportType::Against)?; + let against_collection = load_collection( + &against_filepath, + selection, + ReportType::Against, + allow_failed_sigpaths, + )?; // calculate the minimum number of hashes based on desired threshold let threshold_hashes: u64 = { diff --git a/src/fastmultigather.rs b/src/fastmultigather.rs index f28dcb85..6fb1c932 100644 --- a/src/fastmultigather.rs +++ b/src/fastmultigather.rs @@ -18,14 +18,20 @@ use crate::utils::{ }; pub fn fastmultigather( - query_filepath: camino::Utf8PathBuf, - against_filepath: camino::Utf8PathBuf, + query_filepath: String, + against_filepath: String, threshold_bp: usize, scaled: usize, selection: &Selection, + allow_failed_sigpaths: bool, ) -> Result<()> { // load the list of query paths - let query_collection = load_collection(&query_filepath, selection, ReportType::Query)?; + let query_collection = load_collection( + &query_filepath, + selection, + ReportType::Query, + allow_failed_sigpaths, + )?; println!("Loaded {} sig paths in querylist", query_collection.len()); let threshold_hashes: u64 = { @@ -42,7 +48,12 @@ pub fn fastmultigather( println!("threshold overlap: {} {}", threshold_hashes, threshold_bp); // Load all the against sketches - let against_collection = load_collection(&against_filepath, selection, ReportType::Against)?; + let against_collection = load_collection( + &against_filepath, + selection, + ReportType::Against, + allow_failed_sigpaths, + )?; // load actual signatures let mut sketchlist: Vec = vec![]; diff --git a/src/index.rs b/src/index.rs index 23675614..6fa7e898 100644 --- a/src/index.rs +++ b/src/index.rs @@ -1,52 +1,25 @@ -use camino::Utf8PathBuf as PathBuf; -use sourmash::collection::Collection; use sourmash::index::revindex::RevIndex; -use sourmash::manifest::Manifest; use sourmash::prelude::*; -use sourmash::storage::{FSStorage, InnerStorage, ZipStorage}; use std::path::Path; -use 
crate::utils::load_sketchlist_filenames; +use crate::utils::{load_collection, ReportType}; pub fn index>( - siglist: PathBuf, - manifest: Option

, - selection: Selection, + siglist: String, + selection: &Selection, output: P, save_paths: bool, colors: bool, + allow_failed_sigpaths: bool, ) -> Result<(), Box> { println!("Loading siglist"); - let manifest = if let Some(m) = manifest { - let rdr = std::fs::OpenOptions::new().read(true).open(m.as_ref())?; - Some(Manifest::from_reader(rdr)?) - } else { - None - }; - - let collection = if matches!(&siglist.extension(), Some("zip")) { - if let Some(m) = manifest { - let storage = ZipStorage::from_file(siglist)?; - Collection::new(m, InnerStorage::new(storage)) - } else { - Collection::from_zipfile(siglist)? - } - } else { - let manifest = manifest.unwrap_or_else(|| { - let sig_paths: Vec<_> = load_sketchlist_filenames(&siglist) - .unwrap_or_else(|_| panic!("Error loading siglist")) - .into_iter() - .map(|v| PathBuf::from_path_buf(v).unwrap()) - .collect(); - sig_paths.as_slice().into() - }); - let storage = FSStorage::builder() - .fullpath("".into()) - .subdir("".into()) - .build(); - Collection::new(manifest, InnerStorage::new(storage)) - }; + let collection = load_collection( + &siglist, + selection, + ReportType::General, + allow_failed_sigpaths, + )?; RevIndex::create( output.as_ref(), diff --git a/src/lib.rs b/src/lib.rs index d2365afe..ab178564 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -18,7 +18,7 @@ mod mastiff_manysearch; mod multisearch; mod pairwise; -use camino::Utf8PathBuf; +use camino::Utf8PathBuf as PathBuf; #[pyfunction] fn do_manysearch( @@ -30,20 +30,20 @@ fn do_manysearch( moltype: String, output_path: Option, ) -> anyhow::Result { - let queryfile_path: camino::Utf8PathBuf = querylist_path.clone().into(); - let againstfile_path: camino::Utf8PathBuf = siglist_path.clone().into(); + let againstfile_path: PathBuf = siglist_path.clone().into(); let selection = build_selection(ksize, scaled, &moltype); eprintln!("selection scaled: {:?}", selection.scaled()); + let allow_failed_sigpaths = true; // if siglist_path is revindex, run mastiff_manysearch; 
otherwise run manysearch if is_revindex_database(&againstfile_path) { - // if is_revindex_database(siglist_path.as_ref()) { match mastiff_manysearch::mastiff_manysearch( - queryfile_path, + querylist_path, againstfile_path, &selection, threshold, output_path, + allow_failed_sigpaths, ) { Ok(_) => Ok(0), Err(e) => { @@ -53,11 +53,12 @@ fn do_manysearch( } } else { match manysearch::manysearch( - &queryfile_path, - &againstfile_path, + querylist_path, + siglist_path, &selection, threshold, output_path, + allow_failed_sigpaths, ) { Ok(_) => Ok(0), Err(e) => { @@ -79,20 +80,19 @@ fn do_fastgather( output_path_prefetch: Option, output_path_gather: Option, ) -> anyhow::Result { - let queryfile_path: camino::Utf8PathBuf = query_filename.into(); - let againstfile_path: camino::Utf8PathBuf = siglist_path.into(); - let selection = build_selection(ksize, scaled, &moltype); + let allow_failed_sigpaths = true; match fastgather::fastgather( - &queryfile_path, - &againstfile_path, + query_filename, + siglist_path, threshold_bp, ksize, scaled, &selection, output_path_prefetch, output_path_gather, + allow_failed_sigpaths, ) { Ok(_) => Ok(0), Err(e) => { @@ -112,18 +112,19 @@ fn do_fastmultigather( moltype: String, output_path: Option, ) -> anyhow::Result { - let queryfile_path: camino::Utf8PathBuf = query_filenames.into(); - let againstfile_path: camino::Utf8PathBuf = siglist_path.into(); + let againstfile_path: camino::Utf8PathBuf = siglist_path.clone().into(); let selection = build_selection(ksize, scaled, &moltype); + let allow_failed_sigpaths = true; // if a siglist path is a revindex, run mastiff_manygather. 
If not, run multigather if is_revindex_database(&againstfile_path) { match mastiff_manygather::mastiff_manygather( - queryfile_path, + query_filenames, againstfile_path, &selection, threshold_bp, output_path, + allow_failed_sigpaths, ) { Ok(_) => Ok(0), Err(e) => { @@ -133,11 +134,12 @@ fn do_fastmultigather( } } else { match fastmultigather::fastmultigather( - queryfile_path, - againstfile_path, + query_filenames, + siglist_path, threshold_bp, scaled, &selection, + allow_failed_sigpaths, ) { Ok(_) => Ok(0), Err(e) => { @@ -176,9 +178,15 @@ fn do_index( colors: bool, ) -> anyhow::Result { let selection = build_selection(ksize, scaled, &moltype); - let location = camino::Utf8PathBuf::from(siglist); - let manifest = None; - match index::index(location, manifest, selection, output, save_paths, colors) { + let allow_failed_sigpaths = false; + match index::index( + siglist, + &selection, + output, + save_paths, + colors, + allow_failed_sigpaths, + ) { Ok(_) => Ok(0), Err(e) => { eprintln!("Error: {e}"); @@ -189,7 +197,7 @@ fn do_index( #[pyfunction] fn do_check(index: String, quick: bool) -> anyhow::Result { - let idx: camino::Utf8PathBuf = index.into(); + let idx: PathBuf = index.into(); match check::check(idx, quick) { Ok(_) => Ok(0), Err(e) => { @@ -209,15 +217,16 @@ fn do_multisearch( moltype: String, output_path: Option, ) -> anyhow::Result { - let queryfile_path: camino::Utf8PathBuf = querylist_path.into(); - let againstfile_path: camino::Utf8PathBuf = siglist_path.into(); let selection = build_selection(ksize, scaled, &moltype); + let allow_failed_sigpaths = true; + match multisearch::multisearch( - &queryfile_path, - &againstfile_path, + querylist_path, + siglist_path, threshold, &selection, output_path, + allow_failed_sigpaths, ) { Ok(_) => Ok(0), Err(e) => { @@ -236,9 +245,15 @@ fn do_pairwise( moltype: String, output_path: Option, ) -> anyhow::Result { - let queryfile_path: camino::Utf8PathBuf = siglist_path.into(); let selection = build_selection(ksize, 
scaled, &moltype); - match pairwise::pairwise(&queryfile_path, threshold, &selection, output_path) { + let allow_failed_sigpaths = true; + match pairwise::pairwise( + siglist_path, + threshold, + &selection, + output_path, + allow_failed_sigpaths, + ) { Ok(_) => Ok(0), Err(e) => { eprintln!("Error: {e}"); diff --git a/src/manysearch.rs b/src/manysearch.rs index 53f25e3c..fa7c4db8 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -11,25 +11,30 @@ use sourmash::selection::Selection; use sourmash::signature::SigsTrait; use sourmash::sketch::Sketch; use sourmash::storage::SigStore; -use std::path::Path; use std::sync::atomic; use std::sync::atomic::AtomicUsize; use crate::utils::{csvwriter_thread, load_collection, ReportType, SearchResult}; -pub fn manysearch>( - query_filepath: &camino::Utf8PathBuf, - against_filepath: &camino::Utf8PathBuf, +pub fn manysearch( + query_filepath: String, + against_filepath: String, selection: &Selection, threshold: f64, - output: Option

, + output: Option, + allow_failed_sigpaths: bool, ) -> Result<()> { // Read in list of query paths. eprintln!("Reading queries from: '{}'", query_filepath); // Load all query sigs into memory at once. - let query_collection = load_collection(query_filepath, selection, ReportType::Query)?; + let query_collection = load_collection( + &query_filepath, + selection, + ReportType::Query, + allow_failed_sigpaths, + )?; // load actual signatures let mut query_sketchlist: Vec = vec![]; @@ -46,13 +51,18 @@ pub fn manysearch>( } // Load all _paths_, not signatures, into memory. - let against_collection = load_collection(against_filepath, selection, ReportType::Against)?; + let against_collection = load_collection( + &against_filepath, + selection, + ReportType::Against, + allow_failed_sigpaths, + )?; // set up a multi-producer, single-consumer channel. let (send, recv) = std::sync::mpsc::sync_channel::(rayon::current_num_threads()); // & spawn a thread that is dedicated to printing to a buffered output - let thrd = csvwriter_thread(recv, output.as_ref()); + let thrd = csvwriter_thread(recv, output); // // Main loop: iterate (in parallel) over all search signature paths, diff --git a/src/manysketch.rs b/src/manysketch.rs index 67ff25ae..1fbe399d 100644 --- a/src/manysketch.rs +++ b/src/manysketch.rs @@ -3,10 +3,10 @@ use anyhow::{anyhow, Result}; use rayon::prelude::*; use crate::utils::{load_fasta_fromfile, sigwriter, Params, ZipMessage}; +use camino::Utf8Path as Path; use needletail::parse_fastx_file; use sourmash::cmd::ComputeParameters; use sourmash::signature::Signature; -use std::path::Path; use std::sync::atomic; use std::sync::atomic::AtomicUsize; @@ -117,7 +117,7 @@ fn build_siginfo( let sig = Signature::builder() .hash_function("0.murmur64") .name(Some(name.to_string())) - .filename(Some(filename.to_string_lossy().into_owned())) + .filename(Some(filename.to_string())) .signatures(template) .build(); sigs.push(sig); @@ -128,12 +128,12 @@ fn build_siginfo( (sigs, 
params_vec) } -pub fn manysketch + Sync>( - filelist: P, +pub fn manysketch( + filelist: String, param_str: String, output: String, ) -> Result<(), Box> { - let fileinfo = match load_fasta_fromfile(&filelist) { + let fileinfo = match load_fasta_fromfile(filelist) { Ok(result) => result, Err(e) => bail!("Could not load fromfile csv. Underlying error: {}", e), }; @@ -206,7 +206,7 @@ pub fn manysketch + Sync>( let mut reader = match parse_fastx_file(filename) { Ok(r) => r, Err(err) => { - eprintln!("Error opening file {}: {:?}", filename.display(), err); + eprintln!("Error opening file {}: {:?}", filename, err); let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); return None; } diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index 6a80a647..6755d54a 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -6,7 +6,7 @@ use sourmash::sketch::Sketch; use std::path::Path; // use camino::Utf8Path as Path; -// use camino::Utf8PathBuf as PathBuf; +use camino::Utf8PathBuf as PathBuf; use sourmash::prelude::*; @@ -21,11 +21,12 @@ use std::io::{BufWriter, Write}; use crate::utils::{is_revindex_database, load_collection, ReportType}; pub fn mastiff_manygather>( - queries_file: camino::Utf8PathBuf, - index: camino::Utf8PathBuf, + queries_file: String, + index: PathBuf, selection: &Selection, threshold_bp: usize, output: Option

, + allow_failed_sigpaths: bool, ) -> Result<(), Box> { if !is_revindex_database(&index) { bail!("'{}' is not a valid RevIndex database", index); @@ -34,7 +35,12 @@ pub fn mastiff_manygather>( let db = RevIndex::open(index, true)?; println!("Loaded DB"); - let query_collection = load_collection(&queries_file, selection, ReportType::Query)?; + let query_collection = load_collection( + &queries_file, + selection, + ReportType::Query, + allow_failed_sigpaths, + )?; // set up a multi-producer, single-consumer channel. let (send, recv) = std::sync::mpsc::sync_channel(rayon::current_num_threads()); diff --git a/src/mastiff_manysearch.rs b/src/mastiff_manysearch.rs index 24fff34e..4ef68830 100644 --- a/src/mastiff_manysearch.rs +++ b/src/mastiff_manysearch.rs @@ -1,11 +1,11 @@ /// mastiff_manysearch: mastiff-indexed version of manysearch. use anyhow::Result; +use camino::Utf8PathBuf as PathBuf; use rayon::prelude::*; use sourmash::index::revindex::{RevIndex, RevIndexOps}; use sourmash::selection::Selection; use sourmash::signature::SigsTrait; use sourmash::sketch::Sketch; -use std::path::Path; use std::sync::atomic; use std::sync::atomic::AtomicUsize; @@ -13,12 +13,13 @@ use crate::utils::{ csvwriter_thread, is_revindex_database, load_collection, ReportType, SearchResult, }; -pub fn mastiff_manysearch>( - queries_path: camino::Utf8PathBuf, - index: camino::Utf8PathBuf, +pub fn mastiff_manysearch( + queries_path: String, + index: PathBuf, selection: &Selection, minimum_containment: f64, - output: Option

, + output: Option, + allow_failed_sigpaths: bool, ) -> Result<(), Box> { if !is_revindex_database(&index) { bail!("'{}' is not a valid RevIndex database", index); @@ -28,7 +29,12 @@ pub fn mastiff_manysearch>( println!("Loaded DB"); // Load query paths - let query_collection = load_collection(&queries_path, selection, ReportType::Query)?; + let query_collection = load_collection( + &queries_path, + selection, + ReportType::Query, + allow_failed_sigpaths, + )?; // if query_paths is empty, exit with error. this should already happen via load_collection, i think? if query_collection.len() == 0 { @@ -39,7 +45,7 @@ pub fn mastiff_manysearch>( let (send, recv) = std::sync::mpsc::sync_channel::(rayon::current_num_threads()); // & spawn a thread that is dedicated to printing to a buffered output - let thrd = csvwriter_thread(recv, output.as_ref()); + let thrd = csvwriter_thread(recv, output); // // Main loop: iterate (in parallel) over all search signature paths, diff --git a/src/multisearch.rs b/src/multisearch.rs index 0d772276..ad28c6ab 100644 --- a/src/multisearch.rs +++ b/src/multisearch.rs @@ -19,20 +19,31 @@ use crate::utils::{load_collection, load_mh_with_name_and_md5, ReportType}; /// database once. pub fn multisearch( - query_filepath: &camino::Utf8PathBuf, - against_filepath: &camino::Utf8PathBuf, + query_filepath: String, + against_filepath: String, threshold: f64, selection: &Selection, output: Option, + allow_failed_sigpaths: bool, ) -> Result<(), Box> { // Load all queries into memory at once. - let query_collection = load_collection(query_filepath, selection, ReportType::Query)?; + let query_collection = load_collection( + &query_filepath, + selection, + ReportType::Query, + allow_failed_sigpaths, + )?; let queries = load_mh_with_name_and_md5(query_collection, &selection, ReportType::Query).unwrap(); // Load all against sketches into memory at once. 
- let against_collection = load_collection(against_filepath, selection, ReportType::Against)?; + let against_collection = load_collection( + &against_filepath, + selection, + ReportType::Against, + allow_failed_sigpaths, + )?; let against = load_mh_with_name_and_md5(against_collection, &selection, ReportType::Against).unwrap(); diff --git a/src/pairwise.rs b/src/pairwise.rs index b6713d41..c714f9c8 100644 --- a/src/pairwise.rs +++ b/src/pairwise.rs @@ -19,21 +19,27 @@ use sourmash::selection::Selection; /// Note: this function loads all _signatures_ into memory. pub fn pairwise>( - sigpath: &camino::Utf8PathBuf, + siglist: String, threshold: f64, selection: &Selection, output: Option

, + allow_failed_sigpaths: bool, ) -> Result<(), Box> { // Load all sigs into memory at once. - let collection = load_collection(sigpath, selection, ReportType::Pairwise)?; + let collection = load_collection( + &siglist, + selection, + ReportType::General, + allow_failed_sigpaths, + )?; if collection.len() <= 1 { bail!( "Pairwise requires two or more sketches. Check input: '{:?}'", - &sigpath + &siglist ) } - let sketches = load_mh_with_name_and_md5(collection, &selection, ReportType::Pairwise).unwrap(); + let sketches = load_mh_with_name_and_md5(collection, &selection, ReportType::General).unwrap(); // set up a multi-producer, single-consumer channel. let (send, recv) = std::sync::mpsc::sync_channel(rayon::current_num_threads()); diff --git a/src/python/tests/test_index.py b/src/python/tests/test_index.py index eeb8f76a..432d7630 100644 --- a/src/python/tests/test_index.py +++ b/src/python/tests/test_index.py @@ -89,22 +89,22 @@ def test_index_missing_siglist(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert 'Error loading siglist' in captured.err + assert 'Error: No such file or directory' in captured.err -def test_index_bad_siglist(runtmp, capfd): - # test index with a bad siglist (.sig.gz file instead of pathlist) +def test_index_sig(runtmp, capfd): + # test index with a .sig.gz file instead of pathlist + # (should work now) sig2 = get_test_data('2.fa.sig.gz') output = runtmp.output('out.db') - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'index', sig2, + runtmp.sourmash('scripts', 'index', sig2, '-o', output) captured = capfd.readouterr() print(captured.err) - assert 'Error loading siglist' in captured.err print(runtmp.last_result.err) + assert 'index is done' in runtmp.last_result.err def test_index_bad_siglist_2(runtmp, capfd): @@ -124,28 +124,25 @@ def test_index_bad_siglist_2(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert 'Error processing "no-exist"' in captured.err + 
assert "WARNING: could not load sketches from path 'no-exist'" in captured.err def test_index_empty_siglist(runtmp, capfd): - ## TODO: index:: do not write output if no signatures to write? - # OR, warn user? - # test empty siglist file siglist = runtmp.output('db-sigs.txt') output = runtmp.output('out.db') make_file_list(siglist, []) # empty - # with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'index', siglist, + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'index', siglist, '-o', output) captured = capfd.readouterr() - assert os.path.exists(output) # do we want an empty file, or no file? + assert not os.path.exists(output) # do we want an empty file, or no file? print(runtmp.last_result.out) print(runtmp.last_result.err) print(captured.err) - # assert "No signatures to index loaded, exiting." in captured.err + assert "Error: Signatures failed to load. Exiting." in captured.err def test_index_nomatch_sig_in_siglist(runtmp, capfd): diff --git a/src/utils.rs b/src/utils.rs index 5cd49de1..b31ba31b 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -7,7 +7,9 @@ use sourmash::selection::Select; use std::fs::{create_dir_all, File}; use std::io::{BufRead, BufReader, BufWriter, Write}; use std::panic; -use std::path::{Path, PathBuf}; +// use std::path::{Path, PathBuf}; +use camino::Utf8Path as Path; +use camino::Utf8PathBuf as PathBuf; use std::sync::atomic; use std::sync::atomic::AtomicUsize; @@ -126,34 +128,32 @@ pub fn write_prefetch( } /// Load a list of filenames from a file. Exits on bad lines. 
-pub fn load_sketchlist_filenames>(sketchlist_filename: &P) -> Result> { - let sketchlist_file = BufReader::new(File::open(sketchlist_filename)?); - - let mut sketchlist_filenames: Vec = Vec::new(); - for line in sketchlist_file.lines() { - let line = match line { - Ok(v) => v, - Err(_) => { - return { - let filename = sketchlist_filename.as_ref().display(); - let msg = format!("invalid line in fromfile '{}'", filename); - Err(anyhow!(msg)) - } - } - }; - - if !line.is_empty() { - let mut path = PathBuf::new(); - path.push(line); - sketchlist_filenames.push(path); - } - } - Ok(sketchlist_filenames) -} - -pub fn load_fasta_fromfile>( - sketchlist_filename: &P, -) -> Result> { +// pub fn load_sketchlist_filenames>(sketchlist_filename: &P) -> Result> { +// let sketchlist_file = BufReader::new(File::open(sketchlist_filename)?); + +// let mut sketchlist_filenames: Vec = Vec::new(); +// for line in sketchlist_file.lines() { +// let line = match line { +// Ok(v) => v, +// Err(_) => { +// return { +// let filename = sketchlist_filename.as_ref().display(); +// let msg = format!("invalid line in fromfile '{}'", filename); +// Err(anyhow!(msg)) +// } +// } +// }; + +// if !line.is_empty() { +// let mut path = PathBuf::new(); +// path.push(line); +// sketchlist_filenames.push(path); +// } +// } +// Ok(sketchlist_filenames) +// } + +pub fn load_fasta_fromfile(sketchlist_filename: String) -> Result> { let mut rdr = csv::Reader::from_path(sketchlist_filename)?; // Check for right header @@ -313,7 +313,7 @@ pub fn load_sketches_above_threshold( pub enum ReportType { Query, Against, - Pairwise, + General, } impl std::fmt::Display for ReportType { @@ -321,19 +321,22 @@ impl std::fmt::Display for ReportType { let description = match self { ReportType::Query => "query", ReportType::Against => "search", - ReportType::Pairwise => "signature", + ReportType::General => "signature", }; write!(f, "{}", description) } } pub fn load_collection( - sigpath: &camino::Utf8PathBuf, + siglist: 
&String, selection: &Selection, report_type: ReportType, + allow_failed: bool, ) -> Result { + let sigpath = PathBuf::from(siglist); + if !sigpath.exists() { - bail!("No such file or directory: '{}'", sigpath); + bail!("No such file or directory: '{}'", &sigpath); } let mut n_failed = 0; @@ -344,7 +347,7 @@ pub fn load_collection( } } else { // if pathlist is just a signature path, load it into a collection - match Signature::from_path(sigpath) { + match Signature::from_path(&sigpath) { Ok(signatures) => { // Load the collection from the signature match Collection::from_sigs(signatures) { @@ -358,7 +361,7 @@ pub fn load_collection( } // if not, try to load file as list of sig paths Err(_) => { - // // using core fn doesn't allow us to ignore failed paths; I reimplement loading here to allow + // using core fn doesn't allow us to ignore failed paths; I reimplement loading here to allow let sketchlist_file = BufReader::new(File::open(sigpath)?); let records: Vec = sketchlist_file .lines() @@ -400,7 +403,7 @@ pub fn load_collection( let n_total = collection.len(); let selected = collection.select(selection)?; let n_skipped = n_total - selected.len(); - report_on_collection_loading(&selected, n_skipped, n_failed, report_type)?; + report_on_collection_loading(&selected, n_skipped, n_failed, report_type, allow_failed)?; Ok(selected) } @@ -431,12 +434,16 @@ pub fn report_on_collection_loading( skipped_paths: usize, failed_paths: usize, report_type: ReportType, + allow_failed: bool, ) -> Result<()> { if failed_paths > 0 { eprintln!( "WARNING: {} {} paths failed to load. See error messages above.", failed_paths, report_type ); + if !allow_failed { + bail! {"Signatures failed to load. 
Exiting."} + } } if skipped_paths > 0 { eprintln!( @@ -715,21 +722,21 @@ pub fn make_manifest_row( n_hashes: sketch.size(), with_abundance: abund, name: sig.name().to_string(), - // filename: filename.display().to_string(), - filename: filename.to_str().unwrap().to_string(), + filename: filename.to_string(), } } -pub fn open_stdout_or_file>(output: Option

) -> Box { +pub fn open_stdout_or_file(output: Option) -> Box { // if output is a file, use open_output_file if let Some(path) = output { - Box::new(open_output_file(&path)) + let outpath: PathBuf = path.into(); + Box::new(open_output_file(&outpath)) } else { Box::new(std::io::stdout()) } } -pub fn open_output_file>(output: &P) -> BufWriter { +pub fn open_output_file(output: &PathBuf) -> BufWriter { let file = File::create(output).unwrap_or_else(|e| { eprintln!("Error creating output file: {:?}", e); std::process::exit(1); @@ -772,7 +779,10 @@ pub fn sigwriter + Send + 'static>( output: String, ) -> std::thread::JoinHandle> { std::thread::spawn(move || -> Result<()> { - let file_writer = open_output_file(&output); + // cast output as pathbuf + let outpath: PathBuf = output.into(); + + let file_writer = open_output_file(&outpath); let options = zip::write::FileOptions::default() .compression_method(zip::CompressionMethod::Stored) @@ -845,16 +855,15 @@ pub trait ResultType { fn format_fields(&self) -> Vec; } -pub fn csvwriter_thread( +pub fn csvwriter_thread( recv: std::sync::mpsc::Receiver, - output: Option

, + output: Option, ) -> std::thread::JoinHandle<()> where T: ResultType, - P: Clone + std::convert::AsRef, { // create output file - let out = open_stdout_or_file(output.as_ref()); + let out = open_stdout_or_file(output); // spawn a thread that is dedicated to printing to a buffered output std::thread::spawn(move || { let mut writer = out; From f769aee8ea297b4b4cf4b01b0d28f49b2326db9d Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 1 Feb 2024 19:55:42 -0800 Subject: [PATCH 31/47] unify more code --- src/fastgather.rs | 14 +--- src/fastmultigather.rs | 120 +++++++++++---------------- src/lib.rs | 1 - src/mastiff_manysearch.rs | 5 -- src/python/tests/test_multigather.py | 6 +- src/python/tests/test_multisearch.py | 10 ++- src/python/tests/test_search.py | 13 +-- src/utils.rs | 67 +++++---------- 8 files changed, 90 insertions(+), 146 deletions(-) diff --git a/src/fastgather.rs b/src/fastgather.rs index 280afd54..ab9a55a8 100644 --- a/src/fastgather.rs +++ b/src/fastgather.rs @@ -15,7 +15,6 @@ pub fn fastgather( query_filepath: String, against_filepath: String, threshold_bp: usize, - ksize: u8, scaled: usize, selection: &Selection, gather_output: Option, @@ -36,9 +35,8 @@ pub fn fastgather( ) } // get single query sig and minhash - let query_sig = query_collection.sig_for_dataset(0)?; // need original md5sum, etc - // downsample - let query_sig_ds = query_sig.clone().select(selection)?; + let query_sig = query_collection.sig_for_dataset(0)?; // need this for original md5sum + let query_sig_ds = query_sig.clone().select(selection)?; // downsample let query_mh = match query_sig_ds.minhash() { Some(query_mh) => query_mh, None => { @@ -98,12 +96,6 @@ pub fn fastgather( } // run the gather! 
- consume_query_by_gather( - query_sig.clone(), - matchlist, - threshold_hashes, - gather_output, - ) - .ok(); + consume_query_by_gather(query_sig, matchlist, threshold_hashes, gather_output).ok(); Ok(()) } diff --git a/src/fastmultigather.rs b/src/fastmultigather.rs index 6fb1c932..1283fcc8 100644 --- a/src/fastmultigather.rs +++ b/src/fastmultigather.rs @@ -3,8 +3,6 @@ use anyhow::Result; use rayon::prelude::*; use sourmash::selection::Selection; -use sourmash::sketch::Sketch; -use sourmash::storage::SigStore; use std::sync::atomic; use std::sync::atomic::AtomicUsize; @@ -14,7 +12,8 @@ use std::collections::BinaryHeap; use camino::Utf8Path; use crate::utils::{ - consume_query_by_gather, load_collection, write_prefetch, PrefetchResult, ReportType, + consume_query_by_gather, load_collection, load_mh_with_name_and_md5, write_prefetch, + PrefetchResult, ReportType, }; pub fn fastmultigather( @@ -32,7 +31,6 @@ pub fn fastmultigather( ReportType::Query, allow_failed_sigpaths, )?; - println!("Loaded {} sig paths in querylist", query_collection.len()); let threshold_hashes: u64 = { let x = threshold_bp / scaled; @@ -55,90 +53,70 @@ pub fn fastmultigather( allow_failed_sigpaths, )?; // load actual signatures - let mut sketchlist: Vec = vec![]; - - for (idx, record) in against_collection.iter() { - if let Ok(sig) = against_collection.sig_for_dataset(idx) - // .unwrap() - // .select(&selection) // if we select here, we downsample and the md5sum changes! - // ...which means we would lose the original md5sum that is used in the standard gather results. - { - sketchlist.push(sig); - } else { - eprintln!("Failed to load 'against' record: {}", record.name()); - } - } + let against = + load_mh_with_name_and_md5(against_collection, &selection, ReportType::Against).unwrap(); // Iterate over all queries => do prefetch and gather! 
let processed_queries = AtomicUsize::new(0); let skipped_paths = AtomicUsize::new(0); let failed_paths = AtomicUsize::new(0); - query_collection.par_iter().for_each(|(idx, record)| { - // increment counter of # of queries. q: could we instead use the index from par_iter()? + query_collection.par_iter().for_each(|(_idx, record)| { + // increment counter of # of queries. q: could we instead use the _idx from par_iter(), or will it vary based on thread? let _i = processed_queries.fetch_add(1, atomic::Ordering::SeqCst); // Load query sig - match query_collection.sig_for_dataset(idx) { + match query_collection.sig_from_record(record) { Ok(query_sig) => { let prefix = query_sig.name(); let location = Utf8Path::new(&prefix).file_name().unwrap(); - for sketch in query_sig.iter() { - // Access query MinHash - if let Sketch::MinHash(query) = sketch { - let matchlist: BinaryHeap = sketchlist - .iter() - .filter_map(|sm| { - let mut mm = None; - // Access against MinHash - if let Some(sketch) = sm.sketches().get(0) { - if let Sketch::MinHash(against_sketch) = sketch { - if let Ok(overlap) = - // downsample here to just get downsampled mh and avoid changing md5sum - against_sketch.count_common(&query, true) - { - if overlap >= threshold_hashes { - let result = PrefetchResult { - name: sm.name(), - md5sum: sm.md5sum().clone(), - minhash: against_sketch.clone(), - overlap, - }; - mm = Some(result); - } - } - } + if let Some(query_mh) = query_sig.minhash() { + let matchlist: BinaryHeap = against + .iter() + .filter_map(|(against_mh, against_name, against_md5)| { + let mut mm = None; + if let Ok(overlap) = against_mh.count_common(&query_mh, false) { + if overlap >= threshold_hashes { + let result = PrefetchResult { + name: against_name.clone(), + md5sum: against_md5.clone(), + minhash: against_mh.clone(), + overlap, + }; + mm = Some(result); } - mm - }) - .collect(); - if !matchlist.is_empty() { - let prefetch_output = format!("{}.prefetch.csv", location); - let gather_output = 
format!("{}.gather.csv", location); - - // Save initial list of matches to prefetch output - write_prefetch(&query_sig, Some(prefetch_output), &matchlist).ok(); - - // Now, do the gather! - consume_query_by_gather( - query_sig.clone(), - matchlist, - threshold_hashes, - Some(gather_output), - ) - .ok(); - } else { - println!("No matches to '{}'", location); - } + } + mm + }) + .collect(); + if !matchlist.is_empty() { + let prefetch_output = format!("{}.prefetch.csv", location); + let gather_output = format!("{}.gather.csv", location); + + // Save initial list of matches to prefetch output + write_prefetch(&query_sig, Some(prefetch_output), &matchlist).ok(); + + // Now, do the gather! + consume_query_by_gather( + query_sig.clone(), + matchlist, + threshold_hashes, + Some(gather_output), + ) + .ok(); } else { - eprintln!( - "WARNING: no compatible sketches in path '{}'", - record.internal_location() - ); - let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); + println!("No matches to '{}'", location); } + } else { + // different warning here? Could not load sig from record?? + eprintln!( + "WARNING: no compatible sketches in path '{}'", + record.internal_location() + ); + let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } } Err(_) => { + // different warning here? Could not load sig from record?? eprintln!( "WARNING: no compatible sketches in path '{}'", record.internal_location() diff --git a/src/lib.rs b/src/lib.rs index ab178564..8d427f41 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -87,7 +87,6 @@ fn do_fastgather( query_filename, siglist_path, threshold_bp, - ksize, scaled, &selection, output_path_prefetch, diff --git a/src/mastiff_manysearch.rs b/src/mastiff_manysearch.rs index 4ef68830..5bf716a8 100644 --- a/src/mastiff_manysearch.rs +++ b/src/mastiff_manysearch.rs @@ -36,11 +36,6 @@ pub fn mastiff_manysearch( allow_failed_sigpaths, )?; - // if query_paths is empty, exit with error. this should already happen via load_collection, i think? 
- if query_collection.len() == 0 { - bail!("No query signatures loaded, exiting."); - } - // set up a multi-producer, single-consumer channel. let (send, recv) = std::sync::mpsc::sync_channel::(rayon::current_num_threads()); diff --git a/src/python/tests/test_multigather.py b/src/python/tests/test_multigather.py index 7ec636ba..1f96eed1 100644 --- a/src/python/tests/test_multigather.py +++ b/src/python/tests/test_multigather.py @@ -424,6 +424,7 @@ def test_bad_against_2(runtmp, capfd, zip_query): def test_empty_against(runtmp, capfd): + # like fastgather - exit gracefully. # test bad 'against' file - in this case, an empty one query = get_test_data('SRR606249.sig.gz') query_list = runtmp.output('query.txt') @@ -432,15 +433,14 @@ def test_empty_against(runtmp, capfd): against_list = runtmp.output('against.txt') make_file_list(against_list, []) - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list, + runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list, '-s', '100000') captured = capfd.readouterr() print(captured.err) assert "Sketch loading error: No such file or directory" in captured.err - assert "Error: No search signatures loaded, exiting." in captured.err + assert "No search signatures loaded, exiting." in captured.err @pytest.mark.parametrize('zip_against', [False, True]) diff --git a/src/python/tests/test_multisearch.py b/src/python/tests/test_multisearch.py index ff2136b0..a7b09931 100644 --- a/src/python/tests/test_multisearch.py +++ b/src/python/tests/test_multisearch.py @@ -300,8 +300,8 @@ def test_bad_against(runtmp, capfd): assert "WARNING: 1 search paths failed to load. See error messages above." 
in captured.err -def test_empty_query(runtmp): - # test with an empty query list +def test_empty_query(runtmp, capfd): + # test with an empty query list - fail gracefully query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -314,11 +314,13 @@ def test_empty_query(runtmp): output = runtmp.output('out.csv') - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'multisearch', query_list, against_list, + runtmp.sourmash('scripts', 'multisearch', query_list, against_list, '-o', output) print(runtmp.last_result.err) + captured = capfd.readouterr() + print(captured.err) + assert "No query signatures loaded, exiting." in captured.err # @CTB diff --git a/src/python/tests/test_search.py b/src/python/tests/test_search.py index 2ab45907..c6c49c95 100644 --- a/src/python/tests/test_search.py +++ b/src/python/tests/test_search.py @@ -371,8 +371,8 @@ def test_nomatch_against(runtmp, capfd): output = runtmp.output('out.csv') - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'manysearch', query_list, against_list, + # with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'manysearch', query_list, against_list, '-o', output) captured = capfd.readouterr() @@ -403,7 +403,7 @@ def test_bad_against(runtmp, capfd): @pytest.mark.parametrize("indexed", [False, True]) -def test_empty_query(runtmp, indexed): +def test_empty_query(runtmp, indexed, capfd): # test with an empty query list query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -420,11 +420,14 @@ def test_empty_query(runtmp, indexed): output = runtmp.output('out.csv') - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'manysearch', query_list, against_list, + # with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'manysearch', query_list, against_list, '-o', output) print(runtmp.last_result.err) + captured = capfd.readouterr() + 
print(captured.err) + assert "No query signatures loaded, exiting." in captured.err @pytest.mark.parametrize("indexed", [False, True]) diff --git a/src/utils.rs b/src/utils.rs index b31ba31b..33e9b3dc 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -127,32 +127,6 @@ pub fn write_prefetch( Ok(()) } -/// Load a list of filenames from a file. Exits on bad lines. -// pub fn load_sketchlist_filenames>(sketchlist_filename: &P) -> Result> { -// let sketchlist_file = BufReader::new(File::open(sketchlist_filename)?); - -// let mut sketchlist_filenames: Vec = Vec::new(); -// for line in sketchlist_file.lines() { -// let line = match line { -// Ok(v) => v, -// Err(_) => { -// return { -// let filename = sketchlist_filename.as_ref().display(); -// let msg = format!("invalid line in fromfile '{}'", filename); -// Err(anyhow!(msg)) -// } -// } -// }; - -// if !line.is_empty() { -// let mut path = PathBuf::new(); -// path.push(line); -// sketchlist_filenames.push(path); -// } -// } -// Ok(sketchlist_filenames) -// } - pub fn load_fasta_fromfile(sketchlist_filename: String) -> Result> { let mut rdr = csv::Reader::from_path(sketchlist_filename)?; @@ -265,27 +239,27 @@ pub fn load_sketches_above_threshold( let mut results = Vec::new(); // Load against into memory if let Ok(against_sig) = against_collection.sig_from_record(against_record) { - for sketch in against_sig.sketches() { - if let Sketch::MinHash(against_mh) = sketch { - // currently downsampling here to avoid changing md5sum - if let Ok(overlap) = against_mh.count_common(query, true) { - if overlap >= threshold_hashes { - let result = PrefetchResult { - name: against_record.name().to_string(), - md5sum: against_mh.md5sum(), - minhash: against_mh.clone(), - overlap, - }; - results.push(result); - } + if let Some(against_mh) = against_sig.minhash() { + // if let Some(against_mh) = against_sig.select(&selection).unwrap().minhash() { // downsample via select + // currently downsampling here to avoid changing md5sum + if let 
Ok(overlap) = against_mh.count_common(query, true) { + //downsample via count_common + if overlap >= threshold_hashes { + let result = PrefetchResult { + name: against_record.name().to_string(), + md5sum: against_mh.md5sum(), + minhash: against_mh.clone(), + overlap, + }; + results.push(result); } - } else { - eprintln!( - "WARNING: no compatible sketches in path '{}'", - against_sig.filename() - ); - let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } + } else { + eprintln!( + "WARNING: no compatible sketches in path '{}'", + against_sig.filename() + ); + let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } } else { // this shouldn't happen here anymore -- likely would happen at load_collection @@ -454,7 +428,8 @@ pub fn report_on_collection_loading( // Validate sketches if collection.is_empty() { - bail!("No {} signatures loaded, exiting.", report_type); + eprintln!("No {} signatures loaded, exiting.", report_type); + return Ok(()); } eprintln!("Loaded {} {} signature(s)", collection.len(), report_type); Ok(()) From 8d7781c8b7966e1f8c041ec9ce4e2b4d96b6eb10 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Thu, 1 Feb 2024 20:07:12 -0800 Subject: [PATCH 32/47] rm unused save_paths option --- src/fastmultigather.rs | 12 ++++++------ src/index.rs | 1 - src/lib.rs | 10 +--------- src/python/sourmash_plugin_branchwater/__init__.py | 3 --- 4 files changed, 7 insertions(+), 19 deletions(-) diff --git a/src/fastmultigather.rs b/src/fastmultigather.rs index 1283fcc8..dc10e897 100644 --- a/src/fastmultigather.rs +++ b/src/fastmultigather.rs @@ -9,7 +9,7 @@ use std::sync::atomic::AtomicUsize; use std::collections::BinaryHeap; -use camino::Utf8Path; +use camino::Utf8Path as PathBuf; use crate::utils::{ consume_query_by_gather, load_collection, load_mh_with_name_and_md5, write_prefetch, @@ -24,7 +24,7 @@ pub fn fastmultigather( selection: &Selection, allow_failed_sigpaths: bool, ) -> Result<()> { - // load the list of query paths + // load query collection let query_collection = load_collection( &query_filepath, selection, @@ -45,14 +45,14 @@ pub fn fastmultigather( println!("threshold overlap: {} {}", threshold_hashes, threshold_bp); - // Load all the against sketches + // load against collection let against_collection = load_collection( &against_filepath, selection, ReportType::Against, allow_failed_sigpaths, )?; - // load actual signatures + // load against sketches into memory, downsampling on the way let against = load_mh_with_name_and_md5(against_collection, &selection, ReportType::Against).unwrap(); @@ -64,11 +64,11 @@ pub fn fastmultigather( query_collection.par_iter().for_each(|(_idx, record)| { // increment counter of # of queries. q: could we instead use the _idx from par_iter(), or will it vary based on thread? 
let _i = processed_queries.fetch_add(1, atomic::Ordering::SeqCst); - // Load query sig + // Load query sig (downsampling happens here) match query_collection.sig_from_record(record) { Ok(query_sig) => { let prefix = query_sig.name(); - let location = Utf8Path::new(&prefix).file_name().unwrap(); + let location = PathBuf::new(&prefix).file_name().unwrap(); if let Some(query_mh) = query_sig.minhash() { let matchlist: BinaryHeap = against .iter() diff --git a/src/index.rs b/src/index.rs index 6fa7e898..0ed0a230 100644 --- a/src/index.rs +++ b/src/index.rs @@ -8,7 +8,6 @@ pub fn index>( siglist: String, selection: &Selection, output: P, - save_paths: bool, colors: bool, allow_failed_sigpaths: bool, ) -> Result<(), Box> { diff --git a/src/lib.rs b/src/lib.rs index 8d427f41..16df3ae4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -173,19 +173,11 @@ fn do_index( scaled: usize, moltype: String, output: String, - save_paths: bool, colors: bool, ) -> anyhow::Result { let selection = build_selection(ksize, scaled, &moltype); let allow_failed_sigpaths = false; - match index::index( - siglist, - &selection, - output, - save_paths, - colors, - allow_failed_sigpaths, - ) { + match index::index(siglist, &selection, output, colors, allow_failed_sigpaths) { Ok(_) => Ok(0), Err(e) => { eprintln!("Error: {e}"); diff --git a/src/python/sourmash_plugin_branchwater/__init__.py b/src/python/sourmash_plugin_branchwater/__init__.py index 6aff91b3..def6fec7 100755 --- a/src/python/sourmash_plugin_branchwater/__init__.py +++ b/src/python/sourmash_plugin_branchwater/__init__.py @@ -189,8 +189,6 @@ def __init__(self, p): help='scaled factor at which to do comparisons') p.add_argument('-m', '--moltype', default='DNA', choices = ["DNA", "protein", "dayhoff", "hp"], help = 'molecule type (DNA, protein, dayhoff, or hp; default DNA)') - p.add_argument('--save-paths', action='store_true', - help='save paths to signatures into index. 
Default: save full sig into index') p.add_argument('-c', '--cores', default=0, type=int, help='number of cores to use (default is all available)') @@ -208,7 +206,6 @@ def main(self, args): args.scaled, args.moltype, args.output, - args.save_paths, False) # colors - currently must be false? if status == 0: notify(f"...index is done! results in '{args.output}'") From b6ebc7a18dbfe57b21ac255c822bf78ba49fdd90 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 1 Feb 2024 20:23:16 -0800 Subject: [PATCH 33/47] use updated mh loading --- src/manysearch.rs | 92 ++++++++++++++++++----------------------------- 1 file changed, 35 insertions(+), 57 deletions(-) diff --git a/src/manysearch.rs b/src/manysearch.rs index fa7c4db8..099b451d 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -15,7 +15,9 @@ use sourmash::storage::SigStore; use std::sync::atomic; use std::sync::atomic::AtomicUsize; -use crate::utils::{csvwriter_thread, load_collection, ReportType, SearchResult}; +use crate::utils::{ + csvwriter_thread, load_collection, load_mh_with_name_and_md5, ReportType, SearchResult, +}; pub fn manysearch( query_filepath: String, @@ -35,22 +37,11 @@ pub fn manysearch( ReportType::Query, allow_failed_sigpaths, )?; - // load actual signatures - let mut query_sketchlist: Vec = vec![]; - - for (idx, record) in query_collection.iter() { - if let Ok(sig) = query_collection - .sig_for_dataset(idx) - .unwrap() - .select(&selection) - { - query_sketchlist.push(sig); - } else { - eprintln!("Failed to load 'query' sig: {}", record.name()); - } - } + // load query sketches into memory, downsampling on the way + let query_sketchlist = + load_mh_with_name_and_md5(query_collection, &selection, ReportType::Query).unwrap(); - // Load all _paths_, not signatures, into memory. + // Against: Load all _paths_, not signatures, into memory. 
let against_collection = load_collection( &against_filepath, selection, @@ -76,7 +67,7 @@ pub fn manysearch( let send = against_collection .par_iter() - .filter_map(|(idx, record)| { + .filter_map(|(_idx, record)| { let i = processed_sigs.fetch_add(1, atomic::Ordering::SeqCst); if i % 1000 == 0 { eprintln!("Processed {} search sigs", i); @@ -84,54 +75,41 @@ pub fn manysearch( let mut results = vec![]; - match against_collection.sig_for_dataset(idx) { - Ok(against_sig) => match against_sig.select(selection) { - Ok(against_sig) => { - for sketch in against_sig.iter() { - if let Sketch::MinHash(against_mh) = sketch { - for query_sig in query_sketchlist.iter() { - for sketch in query_sig.iter() { - if let Sketch::MinHash(query_mh) = sketch { - let overlap = - query_mh.count_common(&against_mh, false).unwrap() - as f64; - let query_size = query_mh.size() as f64; - let target_size = against_mh.size() as f64; - - let containment_query_in_target = overlap / query_size; - let containment_in_target = overlap / target_size; - let max_containment = containment_query_in_target - .max(containment_in_target); - let jaccard = - overlap / (target_size + query_size - overlap); - - if containment_query_in_target > threshold { - results.push(SearchResult { - query_name: query_sig.name(), - query_md5: query_mh.md5sum(), - match_name: against_sig.name(), - containment: containment_query_in_target, - intersect_hashes: overlap as usize, - match_md5: Some(against_mh.md5sum()), - jaccard: Some(jaccard), - max_containment: Some(max_containment), - }); - } - } - } - } + // against downsampling happens here + match against_collection.sig_from_record(record) { + Ok(against_sig) => { + if let Some(against_mh) = against_sig.minhash() { + for (query_mh, query_name, query_md5) in query_sketchlist.iter() { + let overlap = query_mh.count_common(&against_mh, false).unwrap() as f64; + let query_size = query_mh.size() as f64; + let target_size = against_mh.size() as f64; + let 
containment_query_in_target = overlap / query_size; + let containment_in_target = overlap / target_size; + let max_containment = + containment_query_in_target.max(containment_in_target); + let jaccard = overlap / (target_size + query_size - overlap); + + if containment_query_in_target > threshold { + results.push(SearchResult { + query_name: query_name.clone(), + query_md5: query_md5.clone(), + match_name: against_sig.name(), + containment: containment_query_in_target, + intersect_hashes: overlap as usize, + match_md5: Some(against_sig.md5sum()), + jaccard: Some(jaccard), + max_containment: Some(max_containment), + }); } } - } - Err(err) => { - eprintln!("Sketch selection error: {}", err); + } else { eprintln!( "WARNING: no compatible sketches in path '{}'", record.internal_location() ); let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } - }, + } Err(err) => { eprintln!("Sketch loading error: {}", err); eprintln!( From a463ac88c5597887c59465e4f7f0f68d0a346e42 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 1 Feb 2024 21:13:46 -0800 Subject: [PATCH 34/47] standardize indexed writing using local struct for now --- src/manysearch.rs | 7 +-- src/mastiff_manygather.rs | 115 ++++++++++++++------------------------ src/mastiff_manysearch.rs | 3 +- src/utils.rs | 40 ++++++++++++- 4 files changed, 84 insertions(+), 81 deletions(-) diff --git a/src/manysearch.rs b/src/manysearch.rs index 099b451d..767bb7d2 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -27,17 +27,14 @@ pub fn manysearch( output: Option, allow_failed_sigpaths: bool, ) -> Result<()> { - // Read in list of query paths. - eprintln!("Reading queries from: '{}'", query_filepath); - - // Load all query sigs into memory at once. 
+ // Load query collection let query_collection = load_collection( &query_filepath, selection, ReportType::Query, allow_failed_sigpaths, )?; - // load query sketches into memory, downsampling on the way + // load all query sketches into memory, downsampling on the way let query_sketchlist = load_mh_with_name_and_md5(query_collection, &selection, ReportType::Query).unwrap(); diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index 6755d54a..48eb61c8 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -1,31 +1,22 @@ /// mastiff_manygather: mastiff-indexed version of fastmultigather. use anyhow::Result; -use rayon::prelude::*; - -use sourmash::sketch::Sketch; -use std::path::Path; - -// use camino::Utf8Path as Path; use camino::Utf8PathBuf as PathBuf; - -use sourmash::prelude::*; - +use rayon::prelude::*; use sourmash::index::revindex::{RevIndex, RevIndexOps}; - +use sourmash::prelude::*; use std::sync::atomic; use std::sync::atomic::AtomicUsize; -use std::fs::File; -use std::io::{BufWriter, Write}; - -use crate::utils::{is_revindex_database, load_collection, ReportType}; +use crate::utils::{ + csvwriter_thread, is_revindex_database, load_collection, BranchwaterGatherResult, ReportType, +}; -pub fn mastiff_manygather>( +pub fn mastiff_manygather( queries_file: String, index: PathBuf, selection: &Selection, threshold_bp: usize, - output: Option

, + output: Option, allow_failed_sigpaths: bool, ) -> Result<(), Box> { if !is_revindex_database(&index) { @@ -43,29 +34,12 @@ pub fn mastiff_manygather>( )?; // set up a multi-producer, single-consumer channel. - let (send, recv) = std::sync::mpsc::sync_channel(rayon::current_num_threads()); + // let (send, recv) = std::sync::mpsc::sync_channel(rayon::current_num_threads()); + let (send, recv) = + std::sync::mpsc::sync_channel::(rayon::current_num_threads()); // & spawn a thread that is dedicated to printing to a buffered output - let out: Box = match output { - Some(path) => Box::new(BufWriter::new(File::create(path).unwrap())), - None => Box::new(std::io::stdout()), - }; - let thrd = std::thread::spawn(move || { - let mut writer = BufWriter::new(out); - writeln!( - &mut writer, - "query_name,query_md5,match_name,match_md5,f_match_query,intersect_bp" - ) - .unwrap(); - for (query, query_md5, m, m_md5, f_match_query, intersect_bp) in recv.into_iter() { - writeln!( - &mut writer, - "\"{}\",{},\"{}\",{},{},{}", - query, query_md5, m, m_md5, f_match_query, intersect_bp - ) - .ok(); - } - }); + let thrd = csvwriter_thread(recv, output); // // Main loop: iterate (in parallel) over all search signature paths, @@ -79,47 +53,44 @@ pub fn mastiff_manygather>( let send = query_collection .par_iter() - .filter_map(|(idx, record)| { + .filter_map(|(_idx, record)| { let threshold = threshold_bp / selection.scaled()? as usize; - match query_collection.sig_for_dataset(idx) { - // match query_collection.sig_from_record(record) { // to be added in core + // query downsampling happens here + match query_collection.sig_from_record(record) { Ok(query_sig) => { let mut results = vec![]; - let mut found_compatible_sketch = false; - for sketch in query_sig.iter() { - if let Sketch::MinHash(query) = sketch { - found_compatible_sketch = true; - // Gather! 
- let (counter, query_colors, hash_to_color) = - db.prepare_gather_counters(&query); - - let matches = db.gather( - counter, - query_colors, - hash_to_color, - threshold, - &query, - Some(selection.clone()), - ); - // extract results - if let Ok(matches) = matches { - for match_ in &matches { - results.push(( - query_sig.name().clone(), - query.md5sum().clone(), - match_.name().clone(), - match_.md5().clone(), - match_.f_match(), // f_match_query - match_.intersect_bp(), - )); // intersect_bp - } - } else { - eprintln!("Error gathering matches: {:?}", matches.err()); + if let Some(query_mh) = query_sig.minhash() { + // Gather! + let (counter, query_colors, hash_to_color) = + db.prepare_gather_counters(&query_mh); + + let matches = db.gather( + counter, + query_colors, + hash_to_color, + threshold, + &query_mh, + Some(selection.clone()), + ); + // extract results TODO: ADD REST OF GATHER COLUMNS + if let Ok(matches) = matches { + for match_ in &matches { + results.push( + (BranchwaterGatherResult { + query_name: query_sig.name().clone(), + query_md5: query_sig.md5sum().clone(), + match_name: match_.name().clone(), + match_md5: match_.md5().clone(), + f_match_query: match_.f_match(), + intersect_bp: match_.intersect_bp(), + }), + ); } + } else { + eprintln!("Error gathering matches: {:?}", matches.err()); } - } - if !found_compatible_sketch { + } else { eprintln!( "WARNING: no compatible sketches in path '{}'", query_sig.filename() diff --git a/src/mastiff_manysearch.rs b/src/mastiff_manysearch.rs index 5bf716a8..c2ddc8b4 100644 --- a/src/mastiff_manysearch.rs +++ b/src/mastiff_manysearch.rs @@ -10,7 +10,8 @@ use std::sync::atomic; use std::sync::atomic::AtomicUsize; use crate::utils::{ - csvwriter_thread, is_revindex_database, load_collection, ReportType, SearchResult, + csvwriter_thread, is_revindex_database, load_collection, open_stdout_or_file, ReportType, + SearchResult, }; pub fn mastiff_manysearch( diff --git a/src/utils.rs b/src/utils.rs index 
33e9b3dc..1cbdc41a 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -4,12 +4,11 @@ use sourmash::encodings::HashFunctions; use sourmash::manifest::Manifest; use sourmash::selection::Select; +use camino::Utf8Path as Path; +use camino::Utf8PathBuf as PathBuf; use std::fs::{create_dir_all, File}; use std::io::{BufRead, BufReader, BufWriter, Write}; use std::panic; -// use std::path::{Path, PathBuf}; -use camino::Utf8Path as Path; -use camino::Utf8PathBuf as PathBuf; use std::sync::atomic; use std::sync::atomic::AtomicUsize; @@ -199,6 +198,8 @@ pub fn load_fasta_fromfile(sketchlist_filename: String) -> Result Vec<&'static str> { + vec![ + "query_name", + "query_md5", + "match_name", + "match_md5", + "f_match_query", + "intersect_bp", + ] + } + fn format_fields(&self) -> Vec { + vec![ + format!("\"{}\"", self.query_name), // Wrap query_name with quotes + self.query_md5.clone(), + format!("\"{}\"", self.match_name), // Wrap match_name with quotes + self.match_md5.clone(), + self.f_match_query.to_string(), + self.intersect_bp.to_string(), + ] + } +} + pub struct ManifestRow { pub md5: String, pub md5short: String, From c7b865b458046ddb799f0cb303298253f4f9f0ad Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 1 Feb 2024 21:35:07 -0800 Subject: [PATCH 35/47] clean up sketch loading and file opening/writing --- src/mastiff_manygather.rs | 1 - src/mastiff_manysearch.rs | 64 ++++++++++++++++++--------------------- src/multisearch.rs | 61 ++++++++++++++----------------------- src/pairwise.rs | 64 +++++++++++++++------------------------ src/utils.rs | 38 +++++++++++++++++++++++ 5 files changed, 115 insertions(+), 113 deletions(-) diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index 48eb61c8..8f19307e 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -34,7 +34,6 @@ pub fn mastiff_manygather( )?; // set up a multi-producer, single-consumer channel. 
- // let (send, recv) = std::sync::mpsc::sync_channel(rayon::current_num_threads()); let (send, recv) = std::sync::mpsc::sync_channel::(rayon::current_num_threads()); diff --git a/src/mastiff_manysearch.rs b/src/mastiff_manysearch.rs index c2ddc8b4..cc5efd57 100644 --- a/src/mastiff_manysearch.rs +++ b/src/mastiff_manysearch.rs @@ -5,13 +5,11 @@ use rayon::prelude::*; use sourmash::index::revindex::{RevIndex, RevIndexOps}; use sourmash::selection::Selection; use sourmash::signature::SigsTrait; -use sourmash::sketch::Sketch; use std::sync::atomic; use std::sync::atomic::AtomicUsize; use crate::utils::{ - csvwriter_thread, is_revindex_database, load_collection, open_stdout_or_file, ReportType, - SearchResult, + csvwriter_thread, is_revindex_database, load_collection, ReportType, SearchResult, }; pub fn mastiff_manysearch( @@ -55,46 +53,44 @@ pub fn mastiff_manysearch( let send_result = query_collection .par_iter() - .filter_map(|(idx, record)| { + .filter_map(|(_idx, record)| { let i = processed_sigs.fetch_add(1, atomic::Ordering::SeqCst); if i % 1000 == 0 { eprintln!("Processed {} search sigs", i); } let mut results = vec![]; - match query_collection.sig_for_dataset(idx) { + // query downsample happens here + match query_collection.sig_from_record(record) { Ok(query_sig) => { - for sketch in query_sig.iter() { - if let Sketch::MinHash(query_mh) = sketch { - // let location = query_sig.filename(); - let query_size = query_mh.size(); - let counter = db.counter_for_query(&query_mh); - let matches = - db.matches_from_counter(counter, minimum_containment as usize); - - // filter the matches for containment - for (path, overlap) in matches { - let containment = overlap as f64 / query_size as f64; - if containment >= minimum_containment { - results.push(SearchResult { - query_name: query_sig.name(), - query_md5: query_sig.md5sum(), - match_name: path.clone(), - containment, - intersect_hashes: overlap, - match_md5: None, - jaccard: None, - max_containment: None, - }); - } 
+ if let Some(query_mh) = query_sig.minhash() { + let query_size = query_mh.size(); + let counter = db.counter_for_query(&query_mh); + let matches = + db.matches_from_counter(counter, minimum_containment as usize); + + // filter the matches for containment + for (path, overlap) in matches { + let containment = overlap as f64 / query_size as f64; + if containment >= minimum_containment { + results.push(SearchResult { + query_name: query_sig.name(), + query_md5: query_sig.md5sum(), + match_name: path.clone(), + containment, + intersect_hashes: overlap, + match_md5: None, + jaccard: None, + max_containment: None, + }); } - } else { - eprintln!( - "WARNING: no compatible sketches in path '{}'", - query_sig.filename() - ); - let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } + } else { + eprintln!( + "WARNING: no compatible sketches in path '{}'", + query_sig.filename() + ); + let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } if results.is_empty() { None diff --git a/src/multisearch.rs b/src/multisearch.rs index ad28c6ab..0ecb6fdf 100644 --- a/src/multisearch.rs +++ b/src/multisearch.rs @@ -1,17 +1,14 @@ -use anyhow::Result; /// multisearch: massively parallel in-memory sketch search. +use anyhow::Result; use rayon::prelude::*; - -use std::fs::File; -use std::io::{BufWriter, Write}; - -use std::sync::atomic; -use std::sync::atomic::AtomicUsize; - use sourmash::selection::Selection; use sourmash::signature::SigsTrait; +use std::sync::atomic; +use std::sync::atomic::AtomicUsize; -use crate::utils::{load_collection, load_mh_with_name_and_md5, ReportType}; +use crate::utils::{ + csvwriter_thread, load_collection, load_mh_with_name_and_md5, MultiSearchResult, ReportType, +}; /// Search many queries against a list of signatures. /// @@ -48,25 +45,11 @@ pub fn multisearch( load_mh_with_name_and_md5(against_collection, &selection, ReportType::Against).unwrap(); // set up a multi-producer, single-consumer channel. 
- let (send, recv) = std::sync::mpsc::sync_channel(rayon::current_num_threads()); - - // & spawn a thread that is dedicated to printing to a buffered output - let out: Box = match output { - Some(path) => Box::new(BufWriter::new(File::create(path).unwrap())), - None => Box::new(std::io::stdout()), - }; - let thrd = std::thread::spawn(move || { - let mut writer = BufWriter::new(out); - writeln!(&mut writer, "query_name,query_md5,match_name,match_md5,containment,max_containment,jaccard,intersect_hashes").unwrap(); - for (query, query_md5, m, m_md5, cont, max_cont, jaccard, overlap) in recv.into_iter() { - writeln!( - &mut writer, - "\"{}\",{},\"{}\",{},{},{},{},{}", - query, query_md5, m, m_md5, cont, max_cont, jaccard, overlap - ) - .ok(); - } - }); + let (send, recv) = + std::sync::mpsc::sync_channel::(rayon::current_num_threads()); + + // // & spawn a thread that is dedicated to printing to a buffered output + let thrd = csvwriter_thread(recv, output); // // Main loop: iterate (in parallel) over all search signature paths, @@ -98,16 +81,18 @@ pub fn multisearch( let jaccard = overlap / (target_size + query_size - overlap); if containment_query_in_target > threshold { - results.push(( - query_name.clone(), - query_md5.clone(), - against_name.clone(), - against_md5.clone(), - containment_query_in_target, - max_containment, - jaccard, - overlap, - )) + results.push( + (MultiSearchResult { + query_name: query_name.clone(), + query_md5: query_md5.clone(), + match_name: against_name.clone(), + match_md5: against_md5.clone(), + containment: containment_query_in_target, + max_containment: max_containment, + jaccard: jaccard, + intersect_hashes: overlap, + }), + ) } } if results.is_empty() { diff --git a/src/pairwise.rs b/src/pairwise.rs index c714f9c8..fa61e0de 100644 --- a/src/pairwise.rs +++ b/src/pairwise.rs @@ -1,28 +1,24 @@ -use anyhow::Result; /// pairwise: massively parallel in-memory pairwise comparisons. 
+use anyhow::Result; use rayon::prelude::*; - -use std::fs::File; -use std::io::{BufWriter, Write}; -use std::path::Path; - use std::sync::atomic; use std::sync::atomic::AtomicUsize; -use sourmash::signature::SigsTrait; - -use crate::utils::{load_collection, load_mh_with_name_and_md5, ReportType}; +use crate::utils::{ + csvwriter_thread, load_collection, load_mh_with_name_and_md5, MultiSearchResult, ReportType, +}; use sourmash::selection::Selection; +use sourmash::signature::SigsTrait; /// Perform pairwise comparisons of all signatures in a list. /// /// Note: this function loads all _signatures_ into memory. -pub fn pairwise>( +pub fn pairwise( siglist: String, threshold: f64, selection: &Selection, - output: Option

, + output: Option, allow_failed_sigpaths: bool, ) -> Result<(), Box> { // Load all sigs into memory at once. @@ -42,25 +38,11 @@ pub fn pairwise>( let sketches = load_mh_with_name_and_md5(collection, &selection, ReportType::General).unwrap(); // set up a multi-producer, single-consumer channel. - let (send, recv) = std::sync::mpsc::sync_channel(rayon::current_num_threads()); - - // & spawn a thread that is dedicated to printing to a buffered output - let out: Box = match output { - Some(path) => Box::new(BufWriter::new(File::create(path).unwrap())), - None => Box::new(std::io::stdout()), - }; - let thrd = std::thread::spawn(move || { - let mut writer = BufWriter::new(out); - writeln!(&mut writer, "query_name,query_md5,match_name,match_md5,containment,max_containment,jaccard,intersect_hashes").unwrap(); - for (query, query_md5, m, m_md5, cont, max_cont, jaccard, overlap) in recv.into_iter() { - writeln!( - &mut writer, - "\"{}\",{},\"{}\",{},{},{},{},{}", - query, query_md5, m, m_md5, cont, max_cont, jaccard, overlap - ) - .ok(); - } - }); + let (send, recv) = + std::sync::mpsc::sync_channel::(rayon::current_num_threads()); + + // // & spawn a thread that is dedicated to printing to a buffered output + let thrd = csvwriter_thread(recv, output); // // Main loop: iterate (in parallel) over all signature, @@ -83,16 +65,18 @@ pub fn pairwise>( let jaccard = overlap / (query1_size + query2_size - overlap); if containment_q1_in_q2 > threshold || containment_q2_in_q1 > threshold { - send.send(( - q1_name.clone(), - q1_md5.clone(), - q2_name.clone(), - q2_md5.clone(), - containment_q1_in_q2, - max_containment, - jaccard, - overlap, - )) + send.send( + (MultiSearchResult { + query_name: q1_name.clone(), + query_md5: q1_md5.clone(), + match_name: q2_name.clone(), + match_md5: q2_md5.clone(), + containment: containment_q1_in_q2, + max_containment: max_containment, + jaccard: jaccard, + intersect_hashes: overlap, + }), + ) .unwrap(); } diff --git a/src/utils.rs b/src/utils.rs 
index 1cbdc41a..00405718 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -644,6 +644,44 @@ impl ResultType for BranchwaterGatherResult { } } +pub struct MultiSearchResult { + pub query_name: String, + pub query_md5: String, + pub match_name: String, + pub match_md5: String, + pub containment: f64, + pub max_containment: f64, + pub jaccard: f64, + pub intersect_hashes: f64, +} + +impl ResultType for MultiSearchResult { + fn header_fields() -> Vec<&'static str> { + vec![ + "query_name", + "query_md5", + "match_name", + "match_md5", + "containment", + "max_containment", + "jaccard", + "intersect_hashes", + ] + } + + fn format_fields(&self) -> Vec { + vec![ + format!("\"{}\"", self.query_name), // Wrap query_name with quotes + self.query_md5.clone(), + format!("\"{}\"", self.match_name), // Wrap match_name with quotes + self.match_md5.clone(), + self.containment.to_string(), + self.max_containment.to_string(), + self.jaccard.to_string(), + self.intersect_hashes.to_string(), + ] + } +} pub struct ManifestRow { pub md5: String, pub md5short: String, From 14af130add291b35f5e0f4914de52a7d0e24ae68 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Thu, 1 Feb 2024 21:44:36 -0800 Subject: [PATCH 36/47] apply clippy suggestions --- src/fastgather.rs | 3 +-- src/fastmultigather.rs | 6 +++--- src/index.rs | 2 +- src/manysearch.rs | 13 ++++--------- src/manysketch.rs | 2 +- src/mastiff_manygather.rs | 22 ++++++++++------------ src/mastiff_manysearch.rs | 2 +- src/multisearch.rs | 28 +++++++++++++--------------- src/pairwise.rs | 26 ++++++++++++-------------- src/utils.rs | 21 ++++++--------------- 10 files changed, 52 insertions(+), 73 deletions(-) diff --git a/src/fastgather.rs b/src/fastgather.rs index ab9a55a8..f70b11e3 100644 --- a/src/fastgather.rs +++ b/src/fastgather.rs @@ -68,8 +68,7 @@ pub fn fastgather( ); // load a set of sketches, filtering for those with overlaps > threshold - let result = - load_sketches_above_threshold(against_collection, &selection, &query_mh, threshold_hashes)?; + let result = load_sketches_above_threshold(against_collection, query_mh, threshold_hashes)?; let matchlist = result.0; let skipped_paths = result.1; let failed_paths = result.2; diff --git a/src/fastmultigather.rs b/src/fastmultigather.rs index dc10e897..a91c33d5 100644 --- a/src/fastmultigather.rs +++ b/src/fastmultigather.rs @@ -54,7 +54,7 @@ pub fn fastmultigather( )?; // load against sketches into memory, downsampling on the way let against = - load_mh_with_name_and_md5(against_collection, &selection, ReportType::Against).unwrap(); + load_mh_with_name_and_md5(against_collection, selection, ReportType::Against).unwrap(); // Iterate over all queries => do prefetch and gather! 
let processed_queries = AtomicUsize::new(0); @@ -73,8 +73,8 @@ pub fn fastmultigather( let matchlist: BinaryHeap = against .iter() .filter_map(|(against_mh, against_name, against_md5)| { - let mut mm = None; - if let Ok(overlap) = against_mh.count_common(&query_mh, false) { + let mut mm: Option = None; + if let Ok(overlap) = against_mh.count_common(query_mh, false) { if overlap >= threshold_hashes { let result = PrefetchResult { name: against_name.clone(), diff --git a/src/index.rs b/src/index.rs index 0ed0a230..3747e6f5 100644 --- a/src/index.rs +++ b/src/index.rs @@ -22,7 +22,7 @@ pub fn index>( RevIndex::create( output.as_ref(), - collection.select(&selection)?.try_into()?, + collection.select(selection)?.try_into()?, colors, )?; diff --git a/src/manysearch.rs b/src/manysearch.rs index 767bb7d2..1ffd7c28 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -5,19 +5,14 @@ /// database once. use anyhow::Result; use rayon::prelude::*; - -use sourmash::prelude::Select; -use sourmash::selection::Selection; -use sourmash::signature::SigsTrait; -use sourmash::sketch::Sketch; -use sourmash::storage::SigStore; - use std::sync::atomic; use std::sync::atomic::AtomicUsize; use crate::utils::{ csvwriter_thread, load_collection, load_mh_with_name_and_md5, ReportType, SearchResult, }; +use sourmash::selection::Selection; +use sourmash::signature::SigsTrait; pub fn manysearch( query_filepath: String, @@ -36,7 +31,7 @@ pub fn manysearch( )?; // load all query sketches into memory, downsampling on the way let query_sketchlist = - load_mh_with_name_and_md5(query_collection, &selection, ReportType::Query).unwrap(); + load_mh_with_name_and_md5(query_collection, selection, ReportType::Query).unwrap(); // Against: Load all _paths_, not signatures, into memory. 
let against_collection = load_collection( @@ -77,7 +72,7 @@ pub fn manysearch( Ok(against_sig) => { if let Some(against_mh) = against_sig.minhash() { for (query_mh, query_name, query_md5) in query_sketchlist.iter() { - let overlap = query_mh.count_common(&against_mh, false).unwrap() as f64; + let overlap = query_mh.count_common(against_mh, false).unwrap() as f64; let query_size = query_mh.size() as f64; let target_size = against_mh.size() as f64; let containment_query_in_target = overlap / query_size; diff --git a/src/manysketch.rs b/src/manysketch.rs index 1fbe399d..a4eefc7a 100644 --- a/src/manysketch.rs +++ b/src/manysketch.rs @@ -158,7 +158,7 @@ pub fn manysketch( let send = std::sync::Arc::new(send); // & spawn a thread that is dedicated to printing to a buffered output - let thrd = sigwriter::<&str>(recv, output); + let thrd = sigwriter(recv, output); // parse param string into params_vec, print error if fail let param_result = parse_params_str(param_str); diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index 8f19307e..cb794735 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -62,29 +62,27 @@ pub fn mastiff_manygather( if let Some(query_mh) = query_sig.minhash() { // Gather! 
let (counter, query_colors, hash_to_color) = - db.prepare_gather_counters(&query_mh); + db.prepare_gather_counters(query_mh); let matches = db.gather( counter, query_colors, hash_to_color, threshold, - &query_mh, + query_mh, Some(selection.clone()), ); // extract results TODO: ADD REST OF GATHER COLUMNS if let Ok(matches) = matches { for match_ in &matches { - results.push( - (BranchwaterGatherResult { - query_name: query_sig.name().clone(), - query_md5: query_sig.md5sum().clone(), - match_name: match_.name().clone(), - match_md5: match_.md5().clone(), - f_match_query: match_.f_match(), - intersect_bp: match_.intersect_bp(), - }), - ); + results.push(BranchwaterGatherResult { + query_name: query_sig.name().clone(), + query_md5: query_sig.md5sum().clone(), + match_name: match_.name().clone(), + match_md5: match_.md5().clone(), + f_match_query: match_.f_match(), + intersect_bp: match_.intersect_bp(), + }); } } else { eprintln!("Error gathering matches: {:?}", matches.err()); diff --git a/src/mastiff_manysearch.rs b/src/mastiff_manysearch.rs index cc5efd57..0b7c163d 100644 --- a/src/mastiff_manysearch.rs +++ b/src/mastiff_manysearch.rs @@ -65,7 +65,7 @@ pub fn mastiff_manysearch( Ok(query_sig) => { if let Some(query_mh) = query_sig.minhash() { let query_size = query_mh.size(); - let counter = db.counter_for_query(&query_mh); + let counter = db.counter_for_query(query_mh); let matches = db.matches_from_counter(counter, minimum_containment as usize); diff --git a/src/multisearch.rs b/src/multisearch.rs index 0ecb6fdf..9e2fe6d7 100644 --- a/src/multisearch.rs +++ b/src/multisearch.rs @@ -32,7 +32,7 @@ pub fn multisearch( allow_failed_sigpaths, )?; let queries = - load_mh_with_name_and_md5(query_collection, &selection, ReportType::Query).unwrap(); + load_mh_with_name_and_md5(query_collection, selection, ReportType::Query).unwrap(); // Load all against sketches into memory at once. 
let against_collection = load_collection( @@ -42,7 +42,7 @@ pub fn multisearch( allow_failed_sigpaths, )?; let against = - load_mh_with_name_and_md5(against_collection, &selection, ReportType::Against).unwrap(); + load_mh_with_name_and_md5(against_collection, selection, ReportType::Against).unwrap(); // set up a multi-producer, single-consumer channel. let (send, recv) = @@ -70,7 +70,7 @@ pub fn multisearch( eprintln!("Processed {} comparisons", i); } - let overlap = query_mh.count_common(&against_mh, false).unwrap() as f64; + let overlap = query_mh.count_common(against_mh, false).unwrap() as f64; // use downsampled sizes let query_size = query_mh.size() as f64; let target_size = against_mh.size() as f64; @@ -81,18 +81,16 @@ pub fn multisearch( let jaccard = overlap / (target_size + query_size - overlap); if containment_query_in_target > threshold { - results.push( - (MultiSearchResult { - query_name: query_name.clone(), - query_md5: query_md5.clone(), - match_name: against_name.clone(), - match_md5: against_md5.clone(), - containment: containment_query_in_target, - max_containment: max_containment, - jaccard: jaccard, - intersect_hashes: overlap, - }), - ) + results.push(MultiSearchResult { + query_name: query_name.clone(), + query_md5: query_md5.clone(), + match_name: against_name.clone(), + match_md5: against_md5.clone(), + containment: containment_query_in_target, + max_containment, + jaccard, + intersect_hashes: overlap, + }) } } if results.is_empty() { diff --git a/src/pairwise.rs b/src/pairwise.rs index fa61e0de..fbfac585 100644 --- a/src/pairwise.rs +++ b/src/pairwise.rs @@ -35,7 +35,7 @@ pub fn pairwise( &siglist ) } - let sketches = load_mh_with_name_and_md5(collection, &selection, ReportType::General).unwrap(); + let sketches = load_mh_with_name_and_md5(collection, selection, ReportType::General).unwrap(); // set up a multi-producer, single-consumer channel. 
let (send, recv) = @@ -54,7 +54,7 @@ pub fn pairwise( .par_iter() .enumerate() .for_each(|(idx, (q1, q1_name, q1_md5))| { - for (j, (q2, q2_name, q2_md5)) in sketches.iter().enumerate().skip(idx + 1) { + for (q2, q2_name, q2_md5) in sketches.iter().skip(idx + 1) { let overlap = q1.count_common(q2, false).unwrap() as f64; let query1_size = q1.size() as f64; let query2_size = q2.size() as f64; @@ -65,18 +65,16 @@ pub fn pairwise( let jaccard = overlap / (query1_size + query2_size - overlap); if containment_q1_in_q2 > threshold || containment_q2_in_q1 > threshold { - send.send( - (MultiSearchResult { - query_name: q1_name.clone(), - query_md5: q1_md5.clone(), - match_name: q2_name.clone(), - match_md5: q2_md5.clone(), - containment: containment_q1_in_q2, - max_containment: max_containment, - jaccard: jaccard, - intersect_hashes: overlap, - }), - ) + send.send(MultiSearchResult { + query_name: q1_name.clone(), + query_md5: q1_md5.clone(), + match_name: q2_name.clone(), + match_md5: q2_md5.clone(), + containment: containment_q1_in_q2, + max_containment, + jaccard, + intersect_hashes: overlap, + }) .unwrap(); } diff --git a/src/utils.rs b/src/utils.rs index 00405718..06f1ddb6 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -4,26 +4,22 @@ use sourmash::encodings::HashFunctions; use sourmash::manifest::Manifest; use sourmash::selection::Select; +use anyhow::{anyhow, Result}; use camino::Utf8Path as Path; use camino::Utf8PathBuf as PathBuf; +use std::cmp::{Ordering, PartialOrd}; +use std::collections::BinaryHeap; use std::fs::{create_dir_all, File}; use std::io::{BufRead, BufReader, BufWriter, Write}; use std::panic; - use std::sync::atomic; use std::sync::atomic::AtomicUsize; -use std::collections::BinaryHeap; - -use anyhow::{anyhow, Result}; -use std::cmp::{Ordering, PartialOrd}; - use sourmash::collection::Collection; use sourmash::manifest::Record; use sourmash::selection::Selection; use sourmash::signature::{Signature, SigsTrait}; use 
sourmash::sketch::minhash::KmerMinHash; -use sourmash::sketch::Sketch; use sourmash::storage::{FSStorage, InnerStorage, SigStore}; /// Structure to hold overlap information from comparisons. @@ -208,7 +204,7 @@ pub fn load_mh_with_name_and_md5( let mut sketchinfo: Vec<(KmerMinHash, String, String)> = Vec::new(); for (_idx, record) in collection.iter() { if let Ok(sig) = collection.sig_from_record(record) { - if let Some(ds_mh) = sig.clone().select(&selection)?.minhash().cloned() { + if let Some(ds_mh) = sig.clone().select(selection)?.minhash().cloned() { sketchinfo.push((ds_mh, record.name().to_string(), record.md5().to_string())); } } else { @@ -227,7 +223,6 @@ pub fn load_mh_with_name_and_md5( pub fn load_sketches_above_threshold( against_collection: Collection, - selection: &Selection, query: &KmerMinHash, threshold_hashes: u64, ) -> Result<(BinaryHeap, usize, usize)> { @@ -475,11 +470,7 @@ pub fn consume_query_by_gather( // let location = query.location; let location = query.filename(); // this is different (original fasta filename) than query.location was (sig name)!! - let sketches = query.sketches(); - let orig_query_mh = match sketches.get(0) { - Some(Sketch::MinHash(mh)) => Ok(mh), - _ => Err(anyhow::anyhow!("No MinHash found")), - }?; + let orig_query_mh = query.minhash().unwrap(); let mut query_mh = orig_query_mh.clone(); let mut last_hashes = orig_query_mh.size(); @@ -821,7 +812,7 @@ pub enum ZipMessage { WriteManifest, } -pub fn sigwriter + Send + 'static>( +pub fn sigwriter( recv: std::sync::mpsc::Receiver, output: String, ) -> std::thread::JoinHandle> { From 13c96d12329a40ae6b736ae5aa697fa0aabdbadb Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Fri, 2 Feb 2024 09:53:16 -0800 Subject: [PATCH 37/47] add back SmallSignature and use --- src/fastmultigather.rs | 10 +++---- src/manysearch.rs | 11 ++++---- src/multisearch.rs | 18 ++++++------ src/pairwise.rs | 63 ++++++++++++++++++++---------------------- src/utils.rs | 21 +++++++++++--- 5 files changed, 67 insertions(+), 56 deletions(-) diff --git a/src/fastmultigather.rs b/src/fastmultigather.rs index a91c33d5..4f61e89c 100644 --- a/src/fastmultigather.rs +++ b/src/fastmultigather.rs @@ -72,14 +72,14 @@ pub fn fastmultigather( if let Some(query_mh) = query_sig.minhash() { let matchlist: BinaryHeap = against .iter() - .filter_map(|(against_mh, against_name, against_md5)| { + .filter_map(|(against)| { let mut mm: Option = None; - if let Ok(overlap) = against_mh.count_common(query_mh, false) { + if let Ok(overlap) = against.minhash.count_common(query_mh, false) { if overlap >= threshold_hashes { let result = PrefetchResult { - name: against_name.clone(), - md5sum: against_md5.clone(), - minhash: against_mh.clone(), + name: against.name.clone(), + md5sum: against.md5sum.clone(), + minhash: against.minhash.clone(), overlap, }; mm = Some(result); diff --git a/src/manysearch.rs b/src/manysearch.rs index 1ffd7c28..b1546c05 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -71,9 +71,10 @@ pub fn manysearch( match against_collection.sig_from_record(record) { Ok(against_sig) => { if let Some(against_mh) = against_sig.minhash() { - for (query_mh, query_name, query_md5) in query_sketchlist.iter() { - let overlap = query_mh.count_common(against_mh, false).unwrap() as f64; - let query_size = query_mh.size() as f64; + for query in query_sketchlist.iter() { + let overlap = + query.minhash.count_common(against_mh, false).unwrap() as f64; + let query_size = query.minhash.size() as f64; let target_size = against_mh.size() as f64; let containment_query_in_target = overlap / query_size; let containment_in_target = overlap / target_size; @@ -83,8 
+84,8 @@ pub fn manysearch( if containment_query_in_target > threshold { results.push(SearchResult { - query_name: query_name.clone(), - query_md5: query_md5.clone(), + query_name: query.name.clone(), + query_md5: query.md5sum.clone(), match_name: against_sig.name(), containment: containment_query_in_target, intersect_hashes: overlap as usize, diff --git a/src/multisearch.rs b/src/multisearch.rs index 9e2fe6d7..55ccc54c 100644 --- a/src/multisearch.rs +++ b/src/multisearch.rs @@ -61,19 +61,19 @@ pub fn multisearch( let send = against .par_iter() - .filter_map(|(against_mh, against_name, against_md5)| { + .filter_map(|(against)| { let mut results = vec![]; // search for matches & save containment. - for (query_mh, query_name, query_md5) in queries.iter() { + for query in queries.iter() { let i = processed_cmp.fetch_add(1, atomic::Ordering::SeqCst); if i % 100000 == 0 { eprintln!("Processed {} comparisons", i); } - let overlap = query_mh.count_common(against_mh, false).unwrap() as f64; + let overlap = query.minhash.count_common(&against.minhash, false).unwrap() as f64; // use downsampled sizes - let query_size = query_mh.size() as f64; - let target_size = against_mh.size() as f64; + let query_size = query.minhash.size() as f64; + let target_size = against.minhash.size() as f64; let containment_query_in_target = overlap / query_size; let containment_in_target = overlap / target_size; @@ -82,10 +82,10 @@ pub fn multisearch( if containment_query_in_target > threshold { results.push(MultiSearchResult { - query_name: query_name.clone(), - query_md5: query_md5.clone(), - match_name: against_name.clone(), - match_md5: against_md5.clone(), + query_name: query.name.clone(), + query_md5: query.md5sum.clone(), + match_name: against.name.clone(), + match_md5: against.md5sum.clone(), containment: containment_query_in_target, max_containment, jaccard, diff --git a/src/pairwise.rs b/src/pairwise.rs index fbfac585..e206bf2b 100644 --- a/src/pairwise.rs +++ b/src/pairwise.rs @@ 
-50,40 +50,37 @@ pub fn pairwise( let processed_cmp = AtomicUsize::new(0); - sketches - .par_iter() - .enumerate() - .for_each(|(idx, (q1, q1_name, q1_md5))| { - for (q2, q2_name, q2_md5) in sketches.iter().skip(idx + 1) { - let overlap = q1.count_common(q2, false).unwrap() as f64; - let query1_size = q1.size() as f64; - let query2_size = q2.size() as f64; - - let containment_q1_in_q2 = overlap / query1_size; - let containment_q2_in_q1 = overlap / query2_size; - let max_containment = containment_q1_in_q2.max(containment_q2_in_q1); - let jaccard = overlap / (query1_size + query2_size - overlap); - - if containment_q1_in_q2 > threshold || containment_q2_in_q1 > threshold { - send.send(MultiSearchResult { - query_name: q1_name.clone(), - query_md5: q1_md5.clone(), - match_name: q2_name.clone(), - match_md5: q2_md5.clone(), - containment: containment_q1_in_q2, - max_containment, - jaccard, - intersect_hashes: overlap, - }) - .unwrap(); - } - - let i = processed_cmp.fetch_add(1, atomic::Ordering::SeqCst); - if i % 100000 == 0 { - eprintln!("Processed {} comparisons", i); - } + sketches.par_iter().enumerate().for_each(|(idx, query)| { + for against in sketches.iter().skip(idx + 1) { + let overlap = query.minhash.count_common(&against.minhash, false).unwrap() as f64; + let query1_size = query.minhash.size() as f64; + let query2_size = against.minhash.size() as f64; + + let containment_q1_in_q2 = overlap / query1_size; + let containment_q2_in_q1 = overlap / query2_size; + let max_containment = containment_q1_in_q2.max(containment_q2_in_q1); + let jaccard = overlap / (query1_size + query2_size - overlap); + + if containment_q1_in_q2 > threshold || containment_q2_in_q1 > threshold { + send.send(MultiSearchResult { + query_name: query.name.clone(), + query_md5: query.md5sum.clone(), + match_name: against.name.clone(), + match_md5: against.md5sum.clone(), + containment: containment_q1_in_q2, + max_containment, + jaccard, + intersect_hashes: overlap, + }) + .unwrap(); } - }); + 
+ let i = processed_cmp.fetch_add(1, atomic::Ordering::SeqCst); + if i % 100000 == 0 { + eprintln!("Processed {} comparisons", i); + } + } + }); // do some cleanup and error handling - drop(send); // close the channel diff --git a/src/utils.rs b/src/utils.rs index 06f1ddb6..8e664412 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -22,6 +22,14 @@ use sourmash::signature::{Signature, SigsTrait}; use sourmash::sketch::minhash::KmerMinHash; use sourmash::storage::{FSStorage, InnerStorage, SigStore}; +/// Track a name/minhash. + +pub struct SmallSignature { + pub location: String, + pub name: String, + pub md5sum: String, + pub minhash: KmerMinHash, +} /// Structure to hold overlap information from comparisons. pub struct PrefetchResult { @@ -200,12 +208,17 @@ pub fn load_mh_with_name_and_md5( collection: Collection, selection: &Selection, report_type: ReportType, -) -> Result> { - let mut sketchinfo: Vec<(KmerMinHash, String, String)> = Vec::new(); +) -> Result> { + let mut sketchinfo: Vec = Vec::new(); for (_idx, record) in collection.iter() { if let Ok(sig) = collection.sig_from_record(record) { - if let Some(ds_mh) = sig.clone().select(selection)?.minhash().cloned() { - sketchinfo.push((ds_mh, record.name().to_string(), record.md5().to_string())); + if let Some(minhash) = sig.clone().select(selection)?.minhash().cloned() { + sketchinfo.push(SmallSignature { + location: record.internal_location().to_string(), + name: sig.name(), + md5sum: sig.md5sum(), + minhash, + }) } } else { bail!( From 2453c9b4912bc50bdad3534700a2aee444512c7d Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Fri, 2 Feb 2024 09:57:02 -0800 Subject: [PATCH 38/47] rename fn back to load_sketches --- src/fastmultigather.rs | 7 +++---- src/manysearch.rs | 7 ++----- src/multisearch.rs | 8 +++----- src/pairwise.rs | 4 ++-- src/utils.rs | 2 +- 5 files changed, 11 insertions(+), 17 deletions(-) diff --git a/src/fastmultigather.rs b/src/fastmultigather.rs index 4f61e89c..91e57e23 100644 --- a/src/fastmultigather.rs +++ b/src/fastmultigather.rs @@ -12,8 +12,8 @@ use std::collections::BinaryHeap; use camino::Utf8Path as PathBuf; use crate::utils::{ - consume_query_by_gather, load_collection, load_mh_with_name_and_md5, write_prefetch, - PrefetchResult, ReportType, + consume_query_by_gather, load_collection, load_sketches, write_prefetch, PrefetchResult, + ReportType, }; pub fn fastmultigather( @@ -53,8 +53,7 @@ pub fn fastmultigather( allow_failed_sigpaths, )?; // load against sketches into memory, downsampling on the way - let against = - load_mh_with_name_and_md5(against_collection, selection, ReportType::Against).unwrap(); + let against = load_sketches(against_collection, selection, ReportType::Against).unwrap(); // Iterate over all queries => do prefetch and gather! 
let processed_queries = AtomicUsize::new(0); diff --git a/src/manysearch.rs b/src/manysearch.rs index b1546c05..d7ff7808 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -8,9 +8,7 @@ use rayon::prelude::*; use std::sync::atomic; use std::sync::atomic::AtomicUsize; -use crate::utils::{ - csvwriter_thread, load_collection, load_mh_with_name_and_md5, ReportType, SearchResult, -}; +use crate::utils::{csvwriter_thread, load_collection, load_sketches, ReportType, SearchResult}; use sourmash::selection::Selection; use sourmash::signature::SigsTrait; @@ -30,8 +28,7 @@ pub fn manysearch( allow_failed_sigpaths, )?; // load all query sketches into memory, downsampling on the way - let query_sketchlist = - load_mh_with_name_and_md5(query_collection, selection, ReportType::Query).unwrap(); + let query_sketchlist = load_sketches(query_collection, selection, ReportType::Query).unwrap(); // Against: Load all _paths_, not signatures, into memory. let against_collection = load_collection( diff --git a/src/multisearch.rs b/src/multisearch.rs index 55ccc54c..569d9f7d 100644 --- a/src/multisearch.rs +++ b/src/multisearch.rs @@ -7,7 +7,7 @@ use std::sync::atomic; use std::sync::atomic::AtomicUsize; use crate::utils::{ - csvwriter_thread, load_collection, load_mh_with_name_and_md5, MultiSearchResult, ReportType, + csvwriter_thread, load_collection, load_sketches, MultiSearchResult, ReportType, }; /// Search many queries against a list of signatures. @@ -31,8 +31,7 @@ pub fn multisearch( ReportType::Query, allow_failed_sigpaths, )?; - let queries = - load_mh_with_name_and_md5(query_collection, selection, ReportType::Query).unwrap(); + let queries = load_sketches(query_collection, selection, ReportType::Query).unwrap(); // Load all against sketches into memory at once. 
let against_collection = load_collection( @@ -41,8 +40,7 @@ pub fn multisearch( ReportType::Against, allow_failed_sigpaths, )?; - let against = - load_mh_with_name_and_md5(against_collection, selection, ReportType::Against).unwrap(); + let against = load_sketches(against_collection, selection, ReportType::Against).unwrap(); // set up a multi-producer, single-consumer channel. let (send, recv) = diff --git a/src/pairwise.rs b/src/pairwise.rs index e206bf2b..aca9f797 100644 --- a/src/pairwise.rs +++ b/src/pairwise.rs @@ -5,7 +5,7 @@ use std::sync::atomic; use std::sync::atomic::AtomicUsize; use crate::utils::{ - csvwriter_thread, load_collection, load_mh_with_name_and_md5, MultiSearchResult, ReportType, + csvwriter_thread, load_collection, load_sketches, MultiSearchResult, ReportType, }; use sourmash::selection::Selection; use sourmash::signature::SigsTrait; @@ -35,7 +35,7 @@ pub fn pairwise( &siglist ) } - let sketches = load_mh_with_name_and_md5(collection, selection, ReportType::General).unwrap(); + let sketches = load_sketches(collection, selection, ReportType::General).unwrap(); // set up a multi-producer, single-consumer channel. let (send, recv) = diff --git a/src/utils.rs b/src/utils.rs index 8e664412..ed4a2606 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -204,7 +204,7 @@ pub fn load_fasta_fromfile(sketchlist_filename: String) -> Result Date: Fri, 2 Feb 2024 11:54:23 -0800 Subject: [PATCH 39/47] use serde serialize for writing instead of custom traits --- src/fastgather.rs | 7 +- src/fastmultigather.rs | 2 +- src/multisearch.rs | 2 +- src/utils.rs | 202 +++++++++-------------------------------- 4 files changed, 46 insertions(+), 167 deletions(-) diff --git a/src/fastgather.rs b/src/fastgather.rs index f70b11e3..349ed974 100644 --- a/src/fastgather.rs +++ b/src/fastgather.rs @@ -1,10 +1,7 @@ /// fastgather: Run gather with a query against a list of files. 
use anyhow::Result; - -use sourmash::selection::Selection; -// use camino; - use sourmash::prelude::Select; +use sourmash::selection::Selection; use crate::utils::{ consume_query_by_gather, load_collection, load_sketches_above_threshold, write_prefetch, @@ -43,7 +40,7 @@ pub fn fastgather( bail!("No query sketch matching selection parameters."); } }; - // build the list of paths to match against. + // load collection to match against. let against_collection = load_collection( &against_filepath, selection, diff --git a/src/fastmultigather.rs b/src/fastmultigather.rs index 91e57e23..1ed14f10 100644 --- a/src/fastmultigather.rs +++ b/src/fastmultigather.rs @@ -71,7 +71,7 @@ pub fn fastmultigather( if let Some(query_mh) = query_sig.minhash() { let matchlist: BinaryHeap = against .iter() - .filter_map(|(against)| { + .filter_map(|against| { let mut mm: Option = None; if let Ok(overlap) = against.minhash.count_common(query_mh, false) { if overlap >= threshold_hashes { diff --git a/src/multisearch.rs b/src/multisearch.rs index 569d9f7d..c4f33843 100644 --- a/src/multisearch.rs +++ b/src/multisearch.rs @@ -59,7 +59,7 @@ pub fn multisearch( let send = against .par_iter() - .filter_map(|(against)| { + .filter_map(|against| { let mut results = vec![]; // search for matches & save containment. 
for query in queries.iter() { diff --git a/src/utils.rs b/src/utils.rs index ed4a2606..4d1cc244 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -7,6 +7,9 @@ use sourmash::selection::Select; use anyhow::{anyhow, Result}; use camino::Utf8Path as Path; use camino::Utf8PathBuf as PathBuf; +use csv::Writer; +use serde::ser::Serializer; +use serde::Serialize; use std::cmp::{Ordering, PartialOrd}; use std::collections::BinaryHeap; use std::fs::{create_dir_all, File}; @@ -568,6 +571,7 @@ pub fn is_revindex_database(path: &camino::Utf8PathBuf) -> bool { } } +#[derive(Serialize)] pub struct SearchResult { pub query_name: String, pub query_md5: String, @@ -579,43 +583,7 @@ pub struct SearchResult { pub max_containment: Option, } -impl ResultType for SearchResult { - fn header_fields() -> Vec<&'static str> { - vec![ - "query_name", - "query_md5", - "match_name", - "containment", - "intersect_hashes", - "match_md5", - "jaccard", - "max_containment", - ] - } - - fn format_fields(&self) -> Vec { - vec![ - format!("\"{}\"", self.query_name), // Wrap query_name with quotes - self.query_md5.clone(), - format!("\"{}\"", self.match_name), // Wrap match_name with quotes - self.containment.to_string(), - self.intersect_hashes.to_string(), - match &self.match_md5 { - Some(md5) => md5.clone(), - None => "".to_string(), - }, - match &self.jaccard { - Some(jaccard) => jaccard.to_string(), - None => "".to_string(), - }, - match &self.max_containment { - Some(max_containment) => max_containment.to_string(), - None => "".to_string(), - }, - ] - } -} - +#[derive(Serialize)] pub struct BranchwaterGatherResult { pub query_name: String, pub query_md5: String, @@ -625,29 +593,7 @@ pub struct BranchwaterGatherResult { pub intersect_bp: usize, } -impl ResultType for BranchwaterGatherResult { - fn header_fields() -> Vec<&'static str> { - vec![ - "query_name", - "query_md5", - "match_name", - "match_md5", - "f_match_query", - "intersect_bp", - ] - } - fn format_fields(&self) -> Vec { - vec![ - 
format!("\"{}\"", self.query_name), // Wrap query_name with quotes - self.query_md5.clone(), - format!("\"{}\"", self.match_name), // Wrap match_name with quotes - self.match_md5.clone(), - self.f_match_query.to_string(), - self.intersect_bp.to_string(), - ] - } -} - +#[derive(Serialize)] pub struct MultiSearchResult { pub query_name: String, pub query_md5: String, @@ -659,33 +605,7 @@ pub struct MultiSearchResult { pub intersect_hashes: f64, } -impl ResultType for MultiSearchResult { - fn header_fields() -> Vec<&'static str> { - vec![ - "query_name", - "query_md5", - "match_name", - "match_md5", - "containment", - "max_containment", - "jaccard", - "intersect_hashes", - ] - } - - fn format_fields(&self) -> Vec { - vec![ - format!("\"{}\"", self.query_name), // Wrap query_name with quotes - self.query_md5.clone(), - format!("\"{}\"", self.match_name), // Wrap match_name with quotes - self.match_md5.clone(), - self.containment.to_string(), - self.max_containment.to_string(), - self.jaccard.to_string(), - self.intersect_hashes.to_string(), - ] - } -} +#[derive(Serialize)] pub struct ManifestRow { pub md5: String, pub md5short: String, @@ -694,50 +614,24 @@ pub struct ManifestRow { pub num: u32, pub scaled: u64, pub n_hashes: usize, - pub with_abundance: bool, + pub with_abundance: BoolPython, pub name: String, pub filename: String, pub internal_location: String, } -pub fn bool_to_python_string(b: bool) -> String { - match b { - true => "True".to_string(), - false => "False".to_string(), - } -} - -impl ResultType for ManifestRow { - fn header_fields() -> Vec<&'static str> { - vec![ - "internal_location", - "md5", - "md5short", - "ksize", - "moltype", - "num", - "scaled", - "n_hashes", - "with_abundance", - "name", - "filename", - ] - } +// A wrapper type for booleans to customize serialization +pub struct BoolPython(bool); - fn format_fields(&self) -> Vec { - vec![ - self.internal_location.clone(), - self.md5.clone(), - self.md5short.clone(), - self.ksize.to_string(), 
- self.moltype.clone(), - self.num.to_string(), - self.scaled.to_string(), - self.n_hashes.to_string(), - bool_to_python_string(self.with_abundance), - format!("\"{}\"", self.name), // Wrap name with quotes - self.filename.clone(), - ] +impl Serialize for BoolPython { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + match self.0 { + true => serializer.serialize_str("True"), + false => serializer.serialize_str("False"), + } } } @@ -771,7 +665,7 @@ pub fn make_manifest_row( num, scaled, n_hashes: sketch.size(), - with_abundance: abund, + with_abundance: BoolPython(abund), name: sig.name().to_string(), filename: filename.to_string(), } @@ -876,24 +770,27 @@ pub fn sigwriter( println!("Writing manifest"); // Start the CSV file inside the zip zip.start_file("SOURMASH-MANIFEST.csv", options).unwrap(); - // write manifest version line writeln!(&mut zip, "# SOURMASH-MANIFEST-VERSION: 1.0").unwrap(); - // Write the header - let header = ManifestRow::header_fields(); - if let Err(e) = writeln!(&mut zip, "{}", header.join(",")) { - eprintln!("Error writing header: {:?}", e); - } + // scoped block for csv writing + { + let mut csv_writer = Writer::from_writer(&mut zip); - // Write each manifest row - for row in &manifest_rows { - let formatted_fields = row.format_fields(); // Assuming you have a format_fields method on ManifestRow - if let Err(e) = writeln!(&mut zip, "{}", formatted_fields.join(",")) { - eprintln!("Error writing item: {:?}", e); + for row in &manifest_rows { + if let Err(e) = csv_writer.serialize(row) { + eprintln!("Error writing item: {:?}", e); + } + } + // CSV writer must be manually flushed to ensure all data is written + if let Err(e) = csv_writer.flush() { + eprintln!("Error flushing CSV writer: {:?}", e); } + } // drop csv writer here + + // Properly finish writing to the ZIP file + if let Err(e) = zip.finish() { + eprintln!("Error finalizing ZIP file: {:?}", e); } - // finalize the zip file writing. 
- zip.finish().unwrap(); } } } @@ -901,37 +798,22 @@ pub fn sigwriter( }) } -pub trait ResultType { - fn header_fields() -> Vec<&'static str>; - fn format_fields(&self) -> Vec; -} - -pub fn csvwriter_thread( +pub fn csvwriter_thread( recv: std::sync::mpsc::Receiver, output: Option, -) -> std::thread::JoinHandle<()> -where - T: ResultType, -{ +) -> std::thread::JoinHandle<()> { // create output file let out = open_stdout_or_file(output); // spawn a thread that is dedicated to printing to a buffered output std::thread::spawn(move || { - let mut writer = out; - - let header = T::header_fields(); - if let Err(e) = writeln!(&mut writer, "{}", header.join(",")) { - eprintln!("Error writing header: {:?}", e); - } - writer.flush().unwrap(); + let mut writer = Writer::from_writer(out); - for item in recv.iter() { - let formatted_fields = item.format_fields(); - if let Err(e) = writeln!(&mut writer, "{}", formatted_fields.join(",")) { + for res in recv.iter() { + if let Err(e) = writer.serialize(res) { eprintln!("Error writing item: {:?}", e); } - writer.flush().unwrap(); } + writer.flush().expect("Failed to flush writer."); }) } From f6989fdd06ebddd23ab206b6b90f1a242952ceeb Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Sat, 10 Feb 2024 12:19:54 -0800 Subject: [PATCH 40/47] upd to 0.12.1 sourmash core; clean up index tests --- Cargo.lock | 21 +++++++++++---------- Cargo.toml | 3 +-- src/python/tests/test_index.py | 15 +-------------- 3 files changed, 13 insertions(+), 26 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0f9f2de5..0e6bc59c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -285,9 +285,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.32" +version = "0.4.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41daef31d7a747c5c847246f36de49ced6f7403b4cdabc807a97b5cc184cda7a" +checksum = "9f13690e35a5e4ace198e7beea2895d29f3a9cc55015fcebe6336bd2010af9eb" dependencies = [ "android-tzdata", "iana-time-zone", @@ -738,9 +738,9 @@ checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" [[package]] name = "memmap2" -version = "0.9.3" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45fd3a57831bf88bc63f8cebc0cf956116276e97fef3966103e96416209f7c92" +checksum = "fe751422e4a8caa417e13c3ea66452215d7d63e19e604f4980461212f3ae1322" dependencies = [ "libc", ] @@ -1241,9 +1241,9 @@ checksum = "8c31b5c4033f8fdde8700e4657be2c497e7288f01515be52168c631e2e4d4086" [[package]] name = "rkyv" -version = "0.7.43" +version = "0.7.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "527a97cdfef66f65998b5f3b637c26f5a5ec09cc52a3f9932313ac645f4190f5" +checksum = "5cba464629b3394fc4dbc6f940ff8f5b4ff5c7aef40f29166fd4ad12acbc99c0" dependencies = [ "bitvec", "bytecheck", @@ -1259,9 +1259,9 @@ dependencies = [ [[package]] name = "rkyv_derive" -version = "0.7.43" +version = "0.7.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5c462a1328c8e67e4d6dbad1eb0355dd43e8ab432c6e227a43657f16ade5033" +checksum = 
"a7dddfff8de25e6f62b9d64e6e432bf1c6736c57d20323e15ee10435fbda7c65" dependencies = [ "proc-macro2", "quote", @@ -1395,8 +1395,9 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" -version = "0.12.0" -source = "git+https://github.com/sourmash-bio/sourmash?rev=94b88cc314f781342721addc5ed35c531732a9b6#94b88cc314f781342721addc5ed35c531732a9b6" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa8187a00787432261dc522b6ebf813251dbbeabc04ed7a47f5cbb9be0d4a508" dependencies = [ "az", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index eb8a6a4f..8a94a87a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,8 +12,7 @@ crate-type = ["cdylib"] pyo3 = { version = "0.20.2", features = ["extension-module", "anyhow"] } rayon = "1.8.1" serde = { version = "1.0.196", features = ["derive"] } -sourmash = { git = "https://github.com/sourmash-bio/sourmash", rev= "94b88cc314f781342721addc5ed35c531732a9b6", features = ["branchwater"] } -#sourmash = { version = "0.12.0", features = ["branchwater"] } +sourmash = { version = "0.12.1", features = ["branchwater"] } serde_json = "1.0.113" niffler = "2.4.0" log = "0.4.14" diff --git a/src/python/tests/test_index.py b/src/python/tests/test_index.py index eeb8f76a..663e0ae2 100644 --- a/src/python/tests/test_index.py +++ b/src/python/tests/test_index.py @@ -128,15 +128,11 @@ def test_index_bad_siglist_2(runtmp, capfd): def test_index_empty_siglist(runtmp, capfd): - ## TODO: index:: do not write output if no signatures to write? - # OR, warn user? 
- # test empty siglist file siglist = runtmp.output('db-sigs.txt') output = runtmp.output('out.db') make_file_list(siglist, []) # empty - # with pytest.raises(utils.SourmashCommandFailed): runtmp.sourmash('scripts', 'index', siglist, '-o', output) @@ -145,12 +141,9 @@ def test_index_empty_siglist(runtmp, capfd): print(runtmp.last_result.out) print(runtmp.last_result.err) print(captured.err) - # assert "No signatures to index loaded, exiting." in captured.err def test_index_nomatch_sig_in_siglist(runtmp, capfd): - ## TODO: index:: do not write output if no signatures to write? - # test index with a siglist file that has (only) a non-matching ksize sig siglist = runtmp.output('against.txt') db = runtmp.output('db.rdb') @@ -159,16 +152,14 @@ def test_index_nomatch_sig_in_siglist(runtmp, capfd): sig1 = get_test_data('1.fa.k21.sig.gz') make_file_list(siglist, [sig2, sig1]) - # with pytest.raises(utils.SourmashCommandFailed): runtmp.sourmash('scripts', 'index', siglist, '-o', db) captured = capfd.readouterr() - assert os.path.exists(db) # do we want an empty file, or no file? + assert os.path.exists(db) # currently empty file print(runtmp.last_result.out) print(runtmp.last_result.err) print(captured.err) - # assert "Couldn't find a compatible MinHash" in captured.err def test_index_zipfile(runtmp, capfd): @@ -195,7 +186,6 @@ def test_index_zipfile(runtmp, capfd): assert 'index is done' in runtmp.last_result.err captured = capfd.readouterr() print(captured.err) - # assert 'Found 3 filepaths' in captured.err def test_index_zipfile_repeated_md5sums(runtmp, capfd): @@ -254,8 +244,6 @@ def test_index_zipfile_multiparam(runtmp, capfd): assert 'index is done' in runtmp.last_result.err captured = capfd.readouterr() print(captured.err) - # assert 'WARNING: skipped 5 index paths - no compatible signatures.' 
in captured.err - # assert 'Found 4 filepaths' in captured.err def test_index_zipfile_bad(runtmp, capfd): @@ -278,7 +266,6 @@ def test_index_zipfile_bad(runtmp, capfd): print(captured.err) assert "Couldn't find End Of Central Directory Record" in captured.err - # assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err def test_index_check(runtmp): From 970c4346f532874745ff746c72558c7edbf28869 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Sat, 10 Feb 2024 12:49:04 -0800 Subject: [PATCH 41/47] fix issues from merge conflicts --- src/index.rs | 2 -- src/lib.rs | 2 -- src/mastiff_manygather.rs | 2 ++ 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/index.rs b/src/index.rs index 5740d803..3747e6f5 100644 --- a/src/index.rs +++ b/src/index.rs @@ -1,5 +1,3 @@ -use camino::Utf8PathBuf as PathBuf; -use sourmash::collection::Collection; use sourmash::index::revindex::RevIndex; use sourmash::prelude::*; use std::path::Path; diff --git a/src/lib.rs b/src/lib.rs index 20fe2108..16df3ae4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -17,8 +17,6 @@ mod mastiff_manygather; mod mastiff_manysearch; mod multisearch; mod pairwise; -use sourmash::encodings::HashFunctions; -use sourmash::selection::Selection; use camino::Utf8PathBuf as PathBuf; diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index c9a10523..cb794735 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -84,6 +84,8 @@ pub fn mastiff_manygather( intersect_bp: match_.intersect_bp(), }); } + } else { + eprintln!("Error gathering matches: {:?}", matches.err()); } } else { eprintln!( From 4169e50dd1192ff4b312fd3f018a6d6045e331a7 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Sat, 10 Feb 2024 13:10:26 -0800 Subject: [PATCH 42/47] clean up tests --- src/python/tests/test_gather.py | 10 ++-------- src/python/tests/test_index.py | 1 - src/python/tests/test_multigather.py | 5 ----- src/python/tests/test_multisearch.py | 6 +++--- src/python/tests/test_pairwise.py | 10 ---------- src/python/tests/test_search.py | 11 +---------- 6 files changed, 6 insertions(+), 37 deletions(-) diff --git a/src/python/tests/test_gather.py b/src/python/tests/test_gather.py index d0376a02..5d4c4108 100644 --- a/src/python/tests/test_gather.py +++ b/src/python/tests/test_gather.py @@ -58,7 +58,6 @@ def test_simple(runtmp, zip_against): assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'rank', 'intersect_bp'} - @pytest.mark.parametrize('zip_against', [False, True]) def test_simple_with_prefetch(runtmp, zip_against): # test basic execution! @@ -135,7 +134,7 @@ def test_bad_query(runtmp, capfd, zip_against): # query doesn't need to be a sig anymore - sig, zip, or pathlist welcome # as long as there's only one sketch that matches params - make_file_list(query, [sig2,sig47]) # [sig2] + make_file_list(query, [sig2,sig47]) make_file_list(against_list, [sig2, sig47, sig63]) if zip_against: @@ -161,11 +160,7 @@ def test_missing_against(runtmp, capfd, zip_against): query = get_test_data('SRR606249.sig.gz') against_list = runtmp.output('against.txt') - sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') - - #make_file_list(against_list, [sig2, sig47, sig63]) + # don't make against list if zip_against: against_list = runtmp.output('against.zip') @@ -207,7 +202,6 @@ def test_sig_against(runtmp, capfd): assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'rank', 'intersect_bp'} - def test_bad_against(runtmp, capfd): # test bad 'against' file - in this case, one containing a bad filename. 
query = get_test_data('SRR606249.sig.gz') diff --git a/src/python/tests/test_index.py b/src/python/tests/test_index.py index d228bbcf..053b11e7 100644 --- a/src/python/tests/test_index.py +++ b/src/python/tests/test_index.py @@ -215,7 +215,6 @@ def test_index_zipfile_repeated_md5sums(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - # assert 'Found 3 filepaths' in captured.err assert 'index is done' in runtmp.last_result.err diff --git a/src/python/tests/test_multigather.py b/src/python/tests/test_multigather.py index 1f96eed1..33ed00d6 100644 --- a/src/python/tests/test_multigather.py +++ b/src/python/tests/test_multigather.py @@ -182,8 +182,6 @@ def test_missing_querylist(runtmp, capfd, indexed, zip_query): captured = capfd.readouterr() print(captured.err) - - # assert 'Error: failed to load query' in captured.err assert 'Error: No such file or directory' in captured.err @@ -313,8 +311,6 @@ def test_nomatch_query(runtmp, capfd, indexed, zip_query): captured = capfd.readouterr() print(captured.err) - - # assert "WARNING: no compatible sketches in path " in captured.err assert "WARNING: skipped 1 query paths - no compatible signatures." in captured.err @@ -424,7 +420,6 @@ def test_bad_against_2(runtmp, capfd, zip_query): def test_empty_against(runtmp, capfd): - # like fastgather - exit gracefully. 
# test bad 'against' file - in this case, an empty one query = get_test_data('SRR606249.sig.gz') query_list = runtmp.output('query.txt') diff --git a/src/python/tests/test_multisearch.py b/src/python/tests/test_multisearch.py index a7b09931..976c8a59 100644 --- a/src/python/tests/test_multisearch.py +++ b/src/python/tests/test_multisearch.py @@ -133,7 +133,6 @@ def test_missing_query(runtmp, capfd, zip_query): sig47 = get_test_data('47.fa.sig.gz') sig63 = get_test_data('63.fa.sig.gz') - #make_file_list(query_list, [sig2, sig47, sig63]) make_file_list(against_list, [sig2, sig47, sig63]) output = runtmp.output('out.csv') @@ -173,6 +172,7 @@ def test_sig_query(runtmp, capfd): df = pandas.read_csv(output) assert len(df) == 1 + def test_bad_query(runtmp, capfd): # test with a bad query list (a missing file) query_list = runtmp.output('query.txt') @@ -238,7 +238,7 @@ def test_missing_against(runtmp, capfd, zip_db): # do not create against_list if zip_db: - #.zip but don't create the file + #specify .zip but don't create the file against_list = runtmp.output('db.zip') output = runtmp.output('out.csv') @@ -263,7 +263,6 @@ def test_sig_against(runtmp, capfd): sig63 = get_test_data('63.fa.sig.gz') make_file_list(query_list, [sig2, sig47, sig63]) - #make_file_list(against_list, [sig2, sig47, sig63]) output = runtmp.output('out.csv') @@ -277,6 +276,7 @@ def test_sig_against(runtmp, capfd): df = pandas.read_csv(output) assert len(df) == 1 + def test_bad_against(runtmp, capfd): # test with a bad against list (a missing file) query_list = runtmp.output('query.txt') diff --git a/src/python/tests/test_pairwise.py b/src/python/tests/test_pairwise.py index 0dd67c05..2207c282 100644 --- a/src/python/tests/test_pairwise.py +++ b/src/python/tests/test_pairwise.py @@ -114,7 +114,6 @@ def test_simple_threshold(runtmp, zip_query): assert len(df) == 1 - def test_sig_query(runtmp, capfd): # sig query is ok now, but fails bc only one sig sig2 = get_test_data('2.fa.sig.gz') @@ -152,7 +151,6 @@ 
def test_bad_query(runtmp, capfd): def test_bad_query_2(runtmp, capfd): # test with a bad query (a .sig.gz file renamed as zip file) - sig2 = get_test_data('2.fa.sig.gz') sig47 = get_test_data('47.fa.sig.gz') sig63 = get_test_data('63.fa.sig.gz') @@ -274,11 +272,6 @@ def test_load_only_one_bug(runtmp, capfd, zip_db): assert not 'WARNING: no compatible sketches in path ' in captured.err - - - - - @pytest.mark.parametrize("zip_query", [False, True]) def test_md5(runtmp, zip_query): # test that md5s match what was in the original files, not downsampled etc. @@ -314,8 +307,6 @@ def test_md5(runtmp, zip_query): print(md5s) - - def test_simple_prot(runtmp): # test basic execution with protein sigs sigs = get_test_data('protein.zip') @@ -368,7 +359,6 @@ def test_simple_prot(runtmp): assert intersect_hashes == 342 - def test_simple_dayhoff(runtmp): # test basic execution with dayhoff sigs sigs = get_test_data('dayhoff.zip') diff --git a/src/python/tests/test_search.py b/src/python/tests/test_search.py index c6c49c95..f84db833 100644 --- a/src/python/tests/test_search.py +++ b/src/python/tests/test_search.py @@ -229,7 +229,7 @@ def test_missing_query(runtmp, capfd, indexed, zip_query): sig47 = get_test_data('47.fa.sig.gz') sig63 = get_test_data('63.fa.sig.gz') - #make_file_list(query_list, [sig2, sig47, sig63]) + #make_file_list(query_list, [sig2, sig47, sig63]) # don't make query make_file_list(against_list, [sig2, sig47, sig63]) if indexed: @@ -266,15 +266,9 @@ def test_sig_query(runtmp, capfd, indexed): output = runtmp.output('out.csv') - # with pytest.raises(utils.SourmashCommandFailed): runtmp.sourmash('scripts', 'manysearch', sig2, against_list, '-o', output) - # captured = capfd.readouterr() - # print(captured.err) - - # assert 'Error: invalid line in fromfile' in captured.err - @pytest.mark.parametrize("indexed", [False, True]) def test_bad_query_2(runtmp, capfd, indexed): @@ -363,7 +357,6 @@ def test_nomatch_against(runtmp, capfd): sig2 = 
get_test_data('2.fa.sig.gz') sig47 = get_test_data('47.fa.sig.gz') sig63 = get_test_data('63.fa.sig.gz') - # nomatch_sketch = get_test_data('genome-s11.fa.gz.sig') nomatch_sketch = get_test_data('SRR606249.sig.gz') make_file_list(query_list, [sig2, sig47, sig63]) @@ -371,7 +364,6 @@ def test_nomatch_against(runtmp, capfd): output = runtmp.output('out.csv') - # with pytest.raises(utils.SourmashCommandFailed): runtmp.sourmash('scripts', 'manysearch', query_list, against_list, '-o', output) @@ -420,7 +412,6 @@ def test_empty_query(runtmp, indexed, capfd): output = runtmp.output('out.csv') - # with pytest.raises(utils.SourmashCommandFailed): runtmp.sourmash('scripts', 'manysearch', query_list, against_list, '-o', output) From 994f0ed4688f7bd26b3eb27ddf9f702322143f93 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Sat, 10 Feb 2024 15:46:07 -0800 Subject: [PATCH 43/47] rustfmt --- src/mastiff_manysearch.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mastiff_manysearch.rs b/src/mastiff_manysearch.rs index effe852e..7d793b2c 100644 --- a/src/mastiff_manysearch.rs +++ b/src/mastiff_manysearch.rs @@ -25,7 +25,7 @@ pub fn mastiff_manysearch( } // Open database once let db = RevIndex::open(index, true)?; - + println!("Loaded DB"); // Load query paths From 862fb65393e45dd34edd225c8b67dc04b194e089 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Mon, 12 Feb 2024 16:36:46 -0800 Subject: [PATCH 44/47] make fmg output filenaming robust to spaces in signame --- src/fastmultigather.rs | 3 ++- src/python/tests/test_multigather.py | 32 ++++++++++++++++++++++++++++ src/utils.rs | 3 ++- 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/src/fastmultigather.rs b/src/fastmultigather.rs index 1ed14f10..dfab2702 100644 --- a/src/fastmultigather.rs +++ b/src/fastmultigather.rs @@ -66,7 +66,8 @@ pub fn fastmultigather( // Load query sig (downsampling happens here) match query_collection.sig_from_record(record) { Ok(query_sig) => { - let prefix = query_sig.name(); + let name = query_sig.name(); + let prefix = name.split(' ').next().unwrap_or_default().to_string(); let location = PathBuf::new(&prefix).file_name().unwrap(); if let Some(query_mh) = query_sig.minhash() { let matchlist: BinaryHeap = against diff --git a/src/python/tests/test_multigather.py b/src/python/tests/test_multigather.py index 33ed00d6..91b62e46 100644 --- a/src/python/tests/test_multigather.py +++ b/src/python/tests/test_multigather.py @@ -85,6 +85,38 @@ def test_simple(runtmp, zip_against): assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'rank', 'intersect_bp'} +def test_simple_space_in_signame(runtmp): + # test basic execution! 
+ query = get_test_data('SRR606249.sig.gz') + renamed_query = runtmp.output('in.zip') + name = 'my-favorite-signame has spaces' + # rename signature + runtmp.sourmash('sig', 'rename', query, name, '-o', renamed_query) + + sig2 = get_test_data('2.fa.sig.gz') + sig47 = get_test_data('47.fa.sig.gz') + sig63 = get_test_data('63.fa.sig.gz') + + against_list = runtmp.output('against.txt') + + make_file_list(against_list, [sig2, sig47, sig63]) + + cwd = os.getcwd() + try: + os.chdir(runtmp.output('')) + runtmp.sourmash('scripts', 'fastmultigather', renamed_query, against_list, + '-s', '100000', '-t', '0') + finally: + os.chdir(cwd) + + print(os.listdir(runtmp.output(''))) + + g_output = runtmp.output('my-favorite-signame.gather.csv') + p_output = runtmp.output('my-favorite-signame.prefetch.csv') + assert os.path.exists(p_output) + assert os.path.exists(g_output) + + def test_simple_zip_query(runtmp): # test basic execution! query = get_test_data('SRR606249.sig.gz') diff --git a/src/utils.rs b/src/utils.rs index 4d1cc244..12687ce6 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -2,6 +2,7 @@ use rayon::prelude::*; use sourmash::encodings::HashFunctions; use sourmash::manifest::Manifest; +use sourmash::selection; use sourmash::selection::Select; use anyhow::{anyhow, Result}; @@ -75,7 +76,7 @@ pub fn prefetch( .filter_map(|result| { let mut mm = None; let searchsig = &result.minhash; - // TODO: fix Select so we can go back to downsample: false here + // downsample within count_common let overlap = searchsig.count_common(query_mh, true); if let Ok(overlap) = overlap { if overlap >= threshold_hashes { From 3c97baa1c92288bb0dae640881ebfed26e8e00ae Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Mon, 12 Feb 2024 16:53:55 -0800 Subject: [PATCH 45/47] minor doc updates --- doc/README.md | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/doc/README.md b/doc/README.md index b3808e45..b858eaf8 100644 --- a/doc/README.md +++ b/doc/README.md @@ -2,29 +2,17 @@ This repository implements five sourmash plugins, `manysketch`, `fastgather`, `fastmultigather`, `multisearch`, and `manysearch`. These plugins make use of multithreading in Rust to provide very fast implementations of `sketch`, `search`, and `gather`. With large databases, these commands can be hundreds to thousands of times faster, and 10-50x lower memory, than sourmash. -The main *drawback* to these plugin commands is that their inputs and outputs are not as rich as the native sourmash commands. In particular, this means that input databases need to be prepared differently. Moreover, the output may be most useful as a prefilter in conjunction with regular sourmash commands - see the instructions below for using `fastgather` to create picklists for sourmash. +The main *drawback* to these plugin commands is that their inputs and outputs are not as rich as the native sourmash commands. This may mean that your input files need to be prepared differently. The output may currently be most useful as a prefilter in conjunction with regular sourmash commands - see the instructions below for using `fastgather` to create picklists for sourmash. ## Input file formats -All four search/gather commands use either zip files or _text files containing lists of signature files_ ("fromfiles") for the search database. `multisearch`, `manysearch` and `fastmultigather` also use either zips or "fromfiles" for queries, too. +All four search/gather commands accept zip files or _text files containing lists of signature files_ ("fromfiles") for the search database. 
`multisearch`, `manysearch` and `fastmultigather` also use either zips or "fromfiles" for queries, too. All commands now accept single signature files as well, though this is only useful for single-query input. `manysketch` takes as input a CSV file with columns `name,genome_filename,protein_filename`. If you don't have `protein_filename` entries, be sure to include the trailing comma so the CSV reader can process the file correctly. ### Using zip files -Zip files are used in two ways, depending on how the command works. - -If the command loads a collection of sketches into memory at the start, then the sketches from the zip file are simply loaded into memory! So, -* `multisearch` loads both query and database into memory; -* `manysearch` loads the queries into memory; -* `fastmultigather` loads the search database into memory; - -If the command loads a collection of sketches throughout execution, then the zip file is _unpacked_ to a temporary directory and the sketches are loaded from there. (This can consume a lot of extra disk space!) So, -* `manysearch` loads the sketches being searched this way; -* `fastgather` loads the database sketches this way; -* `fastmultigather` loads the query sketches this way; - -Note that the temp directory is created under the path specified in the `TMPDIR` environment variable if it is set, otherwise it returns `/tmp`. +Signature zip files are the most efficient file to load, as they contain 'manifest' files with parameter information for each included sketch. When loading the zipfile, we can select relevant signatures without loading the sketches themselves into memory. We then only load the actual sketches (and optionally, downsample to a lower scaled value) when we're ready to use them. 
### Using "fromfiles" @@ -41,6 +29,8 @@ and then build a "fromfile": find gtdb-reps-rs214-k21/ -name "*.sig.gz" -type f > list.gtdb-reps-rs214-k21.txt ``` +When using these files for search, we have no a priori information about the parameters used for each sketch, so we load all signatures into memory at the start. + ## Running the commands ### Running `manysketch` @@ -101,7 +91,7 @@ The results file here, `query.x.gtdb-reps.csv`, will have 8 columns: `query` and The `fastgather` command is a much faster version of `sourmash gather`. -`fastgather` takes a query metagenome and an input collection (zip or "fromfile") as database, and outputs a CSV: +`fastgather` takes a single query metagenome (in any file format) and an input collection (zip or "fromfile") as database, and outputs a CSV: ``` sourmash scripts fastgather query.sig.gz podar-ref-list.txt -o results.csv --cores 4 ``` @@ -144,9 +134,9 @@ The main advantage that `fastmultigather` has over running `fastgather` on multi `fastmultigather` will output two CSV files for each query, a `prefetch` file containing all overlapping matches between that query and the database, and a `gather` file containing the minimum metagenome cover for that query in the database. -The prefetch CSV will be named `{basename}.prefetch.csv`, and the gather CSV will be named `{basename}.gather.csv`. Here, `{basename}` is the filename, stripped of its path. If zipfiles are used, `{basename}` will be the md5sum. +The prefetch CSV will be named `{signame}.prefetch.csv`, and the gather CSV will be named `{signame}.gather.csv`. Here, `{signame}` is the name of your sourmash signature. -**Warning:** At the moment, if two different queries have the same `{basename}`, the CSVs for one of the queries will be overwritten by the other query. The behavior here is undefined in practice, because of multithreading: we don't know what queries will be executed when or files will be written first. 
+**Warning:** At the moment, if two different queries have the same `{signame}`, the CSVs for one of the queries will be overwritten by the other query. The behavior here is undefined in practice, because of multithreading: we don't know which queries will be executed when, or which files will be written first. ### Running `manysearch` From 7ce7ff9f150ad8691952f2bdeb642dc252e67c5a Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 13 Feb 2024 08:27:24 -0800 Subject: [PATCH 46/47] disable rocksdb for fastgather --- src/lib.rs | 7 +++++++ src/python/tests/test_gather.py | 29 +++++++++++++++++++++++++++++ src/utils.rs | 1 - 3 files changed, 36 insertions(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 16df3ae4..bedfaba2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -83,6 +83,13 @@ fn do_fastgather( let selection = build_selection(ksize, scaled, &moltype); let allow_failed_sigpaths = true; + // disable rocksdb input as database + let againstfile_path: camino::Utf8PathBuf = siglist_path.clone().into(); + if is_revindex_database(&againstfile_path) { + eprintln!("Fastgather does not accept 'rocksdb' databases. 
Please use fastmultigather."); + // exit + return Ok(1); + } match fastgather::fastgather( query_filename, siglist_path, diff --git a/src/python/tests/test_gather.py b/src/python/tests/test_gather.py index 5d4c4108..8b9c7d80 100644 --- a/src/python/tests/test_gather.py +++ b/src/python/tests/test_gather.py @@ -628,3 +628,32 @@ def test_simple_hp(runtmp): assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'rank', 'intersect_bp'} print(df) assert df['match_md5'][0] == "ea2a1ad233c2908529d124a330bcb672" + + +def test_indexed_against(runtmp, capfd): + # do not accept rocksdb for now + query = get_test_data('SRR606249.sig.gz') + against_list = runtmp.output('against.txt') + + sig2 = get_test_data('2.fa.sig.gz') + + make_file_list(against_list, [sig2]) + db_against = runtmp.output('against.rocksdb') + + ## index against + runtmp.sourmash('scripts', 'index', against_list, + '-o', db_against, '-k', str(31), '--scaled', str(1000), + '--moltype', "DNA") + + g_output = runtmp.output('gather.csv') + p_output = runtmp.output('prefetch.csv') + + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'fastgather', query, db_against, + '-o', g_output, '--output-prefetch', p_output, + '-s', '100000') + + captured = capfd.readouterr() + print(captured.err) + + assert "Fastgather does not accept 'rocksdb' databases. Please use fastmultigather." in captured.err diff --git a/src/utils.rs b/src/utils.rs index 12687ce6..3a028e8f 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -2,7 +2,6 @@ use rayon::prelude::*; use sourmash::encodings::HashFunctions; use sourmash::manifest::Manifest; -use sourmash::selection; use sourmash::selection::Select; use anyhow::{anyhow, Result}; From c9cda17b12c6e2f0688feaee53fddde5d6480947 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Tue, 13 Feb 2024 08:42:09 -0800 Subject: [PATCH 47/47] instead, disable rocksdb reading within load_collection --- src/lib.rs | 7 ------- src/python/tests/test_gather.py | 2 +- src/utils.rs | 6 ++++++ 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index bedfaba2..16df3ae4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -83,13 +83,6 @@ fn do_fastgather( let selection = build_selection(ksize, scaled, &moltype); let allow_failed_sigpaths = true; - // disable rocksdb input as database - let againstfile_path: camino::Utf8PathBuf = siglist_path.clone().into(); - if is_revindex_database(&againstfile_path) { - eprintln!("Fastgather does not accept 'rocksdb' databases. Please use fastmultigather."); - // exit - return Ok(1); - } match fastgather::fastgather( query_filename, siglist_path, diff --git a/src/python/tests/test_gather.py b/src/python/tests/test_gather.py index 8b9c7d80..5a6f4f62 100644 --- a/src/python/tests/test_gather.py +++ b/src/python/tests/test_gather.py @@ -656,4 +656,4 @@ def test_indexed_against(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert "Fastgather does not accept 'rocksdb' databases. Please use fastmultigather." in captured.err + assert "Cannot load search signatures from a 'rocksdb' database. Please use sig, zip, or pathlist." in captured.err diff --git a/src/utils.rs b/src/utils.rs index 3a028e8f..980eeda3 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -324,6 +324,12 @@ pub fn load_collection( if !sigpath.exists() { bail!("No such file or directory: '{}'", &sigpath); } + + // disallow rocksdb input here + if is_revindex_database(&sigpath) { + bail!("Cannot load {} signatures from a 'rocksdb' database. Please use sig, zip, or pathlist.", report_type); + } + eprintln!("Reading {}(s) from: '{}'", report_type, &siglist); let mut n_failed = 0;