Skip to content

Commit

Permalink
MRG: core: add scaled selection to manifest; add helper functions for…
Browse files Browse the repository at this point in the history
… collection and sig/sketch usage (#2948)

This PR adds:

## New functions:
- `Collection::sig_from_record`
> when we iterate through the sketches, we have both `idx` and `record`
available. I thought it would make sense to just use record directly,
rather than re-getting record from index.
- `Signature::minhash` 
> if there is one minhash sketch available within the sig that matches
selection params, return it
- `Signature::get_sketch`
> if there is one sketch (of any type) available within the sig that
matches selection params, return it. Note that since this returns the
sketch enum, it still requires checking MinHash type afterwards.
@luizirber is there a way to return any of the sketches directly from
the same function (like minhash function, above, but more flexible?).
- `Manifest::From<&PathBuf>`
> build a `manifest` directly from a pathlist file. Added and tested,
but lmk if you think we should just build a list of paths separately. I
wanted this for branchwater, but am not actually using it since neither
the paths or PathBuf loading code allow missing/failed paths.


## New selection functionality
- added `scaled` and `num` selection to manifest. For scaled, if sketch
is compatible (equal scaled or can be downsampled), keep it during
manifest selection. Otherwise, discard.

Tests added for each new function/added code.

Co-authored-by: Luiz Irber <[email protected]>
  • Loading branch information
bluegenes and luizirber authored Feb 5, 2024
1 parent fee6292 commit 9f36e2f
Show file tree
Hide file tree
Showing 3 changed files with 465 additions and 8 deletions.
219 changes: 219 additions & 0 deletions src/core/src/collection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,14 @@ impl Collection {
assert_eq!(sig.signatures.len(), 1);
Ok(sig)
}

pub fn sig_from_record(&self, record: &Record) -> Result<SigStore> {
let match_path = record.internal_location().as_str();
let selection = Selection::from_record(record)?;
let sig = self.storage.load_sig(match_path)?.select(&selection)?;
assert_eq!(sig.signatures.len(), 1);
Ok(sig)
}
}

impl Select for Collection {
Expand All @@ -188,3 +196,214 @@ impl Select for Collection {
Ok(self)
}
}

#[cfg(test)]
mod test {
use camino::Utf8PathBuf as PathBuf;
use std::fs::File;
use std::io::BufReader;

use super::Collection;

use crate::encodings::HashFunctions;
use crate::prelude::Select;
use crate::selection::Selection;
use crate::signature::Signature;

#[test]
fn sigstore_selection_with_downsample() {
// load test sigs
let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
filename.push("../../tests/test-data/47+63-multisig.sig");
let file = File::open(filename).unwrap();
let reader = BufReader::new(file);
let sigs: Vec<Signature> = serde_json::from_reader(reader).expect("Loading error");
// create Selection object
let mut selection = Selection::default();
selection.set_scaled(2000);
// load sigs into collection + select compatible signatures
let cl = Collection::from_sigs(sigs)
.unwrap()
.select(&selection)
.unwrap();
// count collection length
assert_eq!(cl.len(), 6);
for (idx, _rec) in cl.iter() {
// need to pass select again here so we actually downsample
let this_sig = cl.sig_for_dataset(idx).unwrap().select(&selection).unwrap();
let this_mh = this_sig.minhash().unwrap();
assert_eq!(this_mh.scaled(), 2000);
}
}

#[test]
fn sigstore_selection_with_downsample_too_low() {
// load test sigs
let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
filename.push("../../tests/test-data/47+63-multisig.sig");
let file = File::open(filename).unwrap();
let reader = BufReader::new(file);
let sigs: Vec<Signature> = serde_json::from_reader(reader).expect("Loading error");
// create Selection object
let mut selection = Selection::default();
selection.set_scaled(500);
// load sigs into collection + select compatible signatures
let cl = Collection::from_sigs(sigs)
.unwrap()
.select(&selection)
.unwrap();
// no sigs should remain
assert_eq!(cl.len(), 0);
}

#[test]
fn sigstore_selection_scaled_handle_num_sig() {
// load test sigs
let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
// four num=500 sigs
filename.push("../../tests/test-data/genome-s11.fa.gz.sig");
let file = File::open(filename).unwrap();
let reader = BufReader::new(file);
let sigs: Vec<Signature> = serde_json::from_reader(reader).expect("Loading error");
assert_eq!(sigs.len(), 4);
// create Selection object
let mut selection = Selection::default();
selection.set_scaled(1000);
// load sigs into collection + select compatible signatures
let cl = Collection::from_sigs(sigs)
.unwrap()
.select(&selection)
.unwrap();
// no sigs should remain
assert_eq!(cl.len(), 0);
}

#[test]
fn sigstore_selection_num() {
// load test sigs
let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
// four num=500 sigs
filename.push("../../tests/test-data/genome-s11.fa.gz.sig");
let file = File::open(filename).unwrap();
let reader = BufReader::new(file);
let sigs: Vec<Signature> = serde_json::from_reader(reader).expect("Loading error");
let sigs_copy = sigs.clone();
assert_eq!(sigs.len(), 4);
// create Selection object
let mut selection = Selection::default();
selection.set_num(500);
// load sigs into collection + select compatible signatures
let cl = Collection::from_sigs(sigs)
.unwrap()
.select(&selection)
.unwrap();
// all sigs should remain
assert_eq!(cl.len(), 4);
//now select diff num and none should remain
selection.set_num(100);
let cl2 = Collection::from_sigs(sigs_copy)
.unwrap()
.select(&selection)
.unwrap();
assert_eq!(cl2.len(), 0);
}

#[test]
fn sigstore_selection_num_handle_scaled_sig() {
// load test sigs
let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
// four num=500 sigs
filename.push("../../tests/test-data/47+63-multisig.sig");
let file = File::open(filename).unwrap();
let reader = BufReader::new(file);
let sigs: Vec<Signature> = serde_json::from_reader(reader).expect("Loading error");
assert_eq!(sigs.len(), 6);
// create Selection object
let mut selection = Selection::default();
selection.set_num(500);
// load sigs into collection + select compatible signatures
let cl = Collection::from_sigs(sigs)
.unwrap()
.select(&selection)
.unwrap();
// no sigs should remain
assert_eq!(cl.len(), 0);
}

#[test]
fn sigstore_sig_from_record() {
// load test sigs
let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
filename.push("../../tests/test-data/47+63-multisig.sig");
let file = File::open(filename).unwrap();
let reader = BufReader::new(file);
let sigs: Vec<Signature> = serde_json::from_reader(reader).expect("Loading error");
// create Selection object
let mut selection = Selection::default();
selection.set_scaled(2000);
// load sigs into collection + select compatible signatures
let cl = Collection::from_sigs(sigs)
.unwrap()
.select(&selection)
.unwrap();
// no sigs should remain
assert_eq!(cl.len(), 6);
for (_idx, rec) in cl.iter() {
// need to pass select again here so we actually downsample
let this_sig = cl.sig_from_record(rec).unwrap().select(&selection).unwrap();
let this_mh = this_sig.minhash().unwrap();
assert_eq!(this_mh.scaled(), 2000);
}
}

#[test]
fn sigstore_selection_moltype_zip() {
// load test sigs
let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
filename.push("../../tests/test-data/prot/hp.zip");
// create Selection object
let mut selection = Selection::default();
selection.set_scaled(100);
selection.set_moltype(HashFunctions::Murmur64Hp);
// load sigs into collection + select compatible signatures
let cl = Collection::from_zipfile(&filename)
.unwrap()
.select(&selection)
.unwrap();
// count collection length
assert_eq!(cl.len(), 2);
for (idx, _rec) in cl.iter() {
// need to pass select again here so we actually downsample
let this_sig = cl.sig_for_dataset(idx).unwrap().select(&selection).unwrap();
let this_mh = this_sig.minhash().unwrap();
assert_eq!(this_mh.scaled(), 100);
}
}

#[test]
fn sigstore_selection_moltype_sig() {
// load test sigs
let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
filename
.push("../../tests/test-data/prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig");
let file = File::open(filename).unwrap();
let reader = BufReader::new(file);
let sigs: Vec<Signature> = serde_json::from_reader(reader).expect("Loading error");
// create Selection object
let mut selection = Selection::default();
selection.set_moltype(HashFunctions::Murmur64Hp);
// load sigs into collection + select compatible signatures
let cl = Collection::from_sigs(sigs)
.unwrap()
.select(&selection)
.unwrap();
// count collection length
assert_eq!(cl.len(), 1);
for (idx, _rec) in cl.iter() {
// need to pass select again here so we actually downsample
let this_sig = cl.sig_for_dataset(idx).unwrap().select(&selection).unwrap();
let this_mh = this_sig.minhash().unwrap();
assert_eq!(this_mh.scaled(), 100);
}
}
}
113 changes: 112 additions & 1 deletion src/core/src/manifest.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use std::convert::TryInto;
use std::io::{Read, Write};
use std::fs::File;
use std::io::{BufRead, BufReader, Read, Write};
use std::ops::Deref;

use camino::Utf8PathBuf as PathBuf;
Expand Down Expand Up @@ -200,6 +201,17 @@ impl Select for Manifest {
} else {
valid
};
valid = if let Some(scaled) = selection.scaled() {
// num sigs have row.scaled = 0, don't include them
valid && row.scaled != 0 && row.scaled <= scaled as u64
} else {
valid
};
valid = if let Some(num) = selection.num() {
valid && row.num == num
} else {
valid
};
valid
});

Expand Down Expand Up @@ -270,10 +282,109 @@ impl From<&[PathBuf]> for Manifest {
}
}

impl From<&PathBuf> for Manifest {
fn from(pathlist: &PathBuf) -> Self {
let file = File::open(pathlist).unwrap_or_else(|_| panic!("Failed to open {:?}", pathlist));
let reader = BufReader::new(file);

let paths: Vec<PathBuf> = reader
.lines()
.map(|line| line.unwrap_or_else(|_| panic!("Failed to read line from {:?}", pathlist)))
.map(PathBuf::from)
.collect();

paths.as_slice().into()
}
}

impl Deref for Manifest {
type Target = Vec<Record>;

fn deref(&self) -> &Self::Target {
&self.records
}
}

#[cfg(test)]
mod test {
use camino::Utf8PathBuf as PathBuf;
use std::fs::File;
use std::io::Write;
use tempfile::TempDir;

use super::Manifest;

#[test]
fn manifest_from_pathlist() {
let temp_dir = TempDir::new().unwrap();
let utf8_output = PathBuf::from_path_buf(temp_dir.path().to_path_buf())
.expect("Path should be valid UTF-8");
let mut filename = utf8_output.join("sig-pathlist.txt");
//convert to camino utf8pathbuf
filename = PathBuf::from(filename);
// build sig filenames
let base_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
let test_sigs = vec![
"../../tests/test-data/47.fa.sig",
"../../tests/test-data/63.fa.sig",
];

let full_paths: Vec<_> = test_sigs
.into_iter()
.map(|sig| base_path.join(sig))
.collect();

// write a file in test directory with a filename on each line
let mut pathfile = File::create(&filename).unwrap();
for sigfile in &full_paths {
writeln!(pathfile, "{}", sigfile).unwrap();
}

// load into manifest
let manifest = Manifest::from(&filename);
assert_eq!(manifest.len(), 2);
}

#[test]
#[should_panic(expected = "Failed to open \"no-exist\"")]
fn manifest_from_pathlist_nonexistent_file() {
let filename = PathBuf::from("no-exist");
let _manifest = Manifest::from(&filename);
}

#[test]
#[should_panic]
fn manifest_from_pathlist_badfile() {
let temp_dir = TempDir::new().unwrap();
let utf8_output = PathBuf::from_path_buf(temp_dir.path().to_path_buf())
.expect("Path should be valid UTF-8");
let mut filename = utf8_output.join("sig-pathlist.txt");
//convert to camino utf8pathbuf
filename = PathBuf::from(filename);

let mut pathfile = File::create(&filename).unwrap();
write!(pathfile, "Valid line\n").unwrap();
pathfile.write_all(&[0xED, 0xA0, 0x80]).unwrap(); // invalid UTF-8

// load into manifest
let _manifest = Manifest::from(&filename);
}

#[test]
#[should_panic]
fn manifest_from_paths_badpath() {
let base_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
let test_sigs = vec![
PathBuf::from("no-exist"),
PathBuf::from("../../tests/test-data/63.fa.sig"),
];

let full_paths: Vec<PathBuf> = test_sigs
.into_iter()
.map(|sig| base_path.join(sig))
.collect();

// load into manifest
let _manifest = Manifest::from(&full_paths[..]); // pass full_paths as a slice
}
}
Loading

0 comments on commit 9f36e2f

Please sign in to comment.