Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MRG: improve downsampling behavior on KmerMinHash; fix RevIndex::gather bug around scaled. #3342

Merged
merged 25 commits into from
Oct 15, 2024
Merged
Show file tree
Hide file tree
Changes from 24 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions include/sourmash.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@ enum SourmashErrorCode {
SOURMASH_ERROR_CODE_NON_EMPTY_MIN_HASH = 106,
SOURMASH_ERROR_CODE_MISMATCH_NUM = 107,
SOURMASH_ERROR_CODE_NEEDS_ABUNDANCE_TRACKING = 108,
SOURMASH_ERROR_CODE_CANNOT_UPSAMPLE_SCALED = 109,
SOURMASH_ERROR_CODE_NO_MIN_HASH_FOUND = 110,
SOURMASH_ERROR_CODE_EMPTY_SIGNATURE = 111,
SOURMASH_ERROR_CODE_MULTIPLE_SKETCHES_FOUND = 112,
SOURMASH_ERROR_CODE_INVALID_DNA = 1101,
SOURMASH_ERROR_CODE_INVALID_PROT = 1102,
SOURMASH_ERROR_CODE_INVALID_CODON_LENGTH = 1103,
Expand Down
2 changes: 1 addition & 1 deletion src/core/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "sourmash"
version = "0.15.2"
version = "0.16.0"
authors = ["Luiz Irber <[email protected]>", "N. Tessa Pierce-Ward <[email protected]>"]
description = "tools for comparing biological sequences with k-mer sketches"
repository = "https://github.com/sourmash-bio/sourmash"
Expand Down
1 change: 1 addition & 0 deletions src/core/src/collection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,7 @@ mod test {
use crate::prelude::Select;
use crate::selection::Selection;
use crate::signature::Signature;
#[cfg(all(feature = "branchwater", not(target_arch = "wasm32")))]
use crate::Result;

#[test]
Expand Down
20 changes: 20 additions & 0 deletions src/core/src/errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ pub enum SourmashError {
#[error("internal error: {message:?}")]
Internal { message: String },

#[error("new scaled smaller than previous; cannot upsample")]
CannotUpsampleScaled,

#[error("must have same num: {n1} != {n2}")]
MismatchNum { n1: u32, n2: u32 },

Expand All @@ -28,6 +31,15 @@ pub enum SourmashError {
#[error("sketch needs abundance for this operation")]
NeedsAbundanceTracking,

#[error("Expected a MinHash sketch in this signature")]
NoMinHashFound,

#[error("Empty signature")]
EmptySignature,

#[error("Multiple sketches found, expected one")]
MultipleSketchesFound,

#[error("Invalid hash function: {function:?}")]
InvalidHashFunction { function: String },

Expand Down Expand Up @@ -104,6 +116,10 @@ pub enum SourmashErrorCode {
NonEmptyMinHash = 1_06,
MismatchNum = 1_07,
NeedsAbundanceTracking = 1_08,
CannotUpsampleScaled = 1_09,
NoMinHashFound = 1_10,
EmptySignature = 1_11,
MultipleSketchesFound = 1_12,
// Input sequence errors
InvalidDNA = 11_01,
InvalidProt = 11_02,
Expand Down Expand Up @@ -132,6 +148,7 @@ impl SourmashErrorCode {
match error {
SourmashError::Internal { .. } => SourmashErrorCode::Internal,
SourmashError::Panic { .. } => SourmashErrorCode::Panic,
SourmashError::CannotUpsampleScaled { .. } => SourmashErrorCode::CannotUpsampleScaled,
SourmashError::MismatchNum { .. } => SourmashErrorCode::MismatchNum,
SourmashError::NeedsAbundanceTracking { .. } => {
SourmashErrorCode::NeedsAbundanceTracking
Expand All @@ -142,6 +159,9 @@ impl SourmashErrorCode {
SourmashError::MismatchSeed => SourmashErrorCode::MismatchSeed,
SourmashError::MismatchSignatureType => SourmashErrorCode::MismatchSignatureType,
SourmashError::NonEmptyMinHash { .. } => SourmashErrorCode::NonEmptyMinHash,
SourmashError::NoMinHashFound => SourmashErrorCode::NoMinHashFound,
SourmashError::EmptySignature => SourmashErrorCode::EmptySignature,
SourmashError::MultipleSketchesFound => SourmashErrorCode::MultipleSketchesFound,
SourmashError::InvalidDNA { .. } => SourmashErrorCode::InvalidDNA,
SourmashError::InvalidProt { .. } => SourmashErrorCode::InvalidProt,
SourmashError::InvalidCodonLength { .. } => SourmashErrorCode::InvalidCodonLength,
Expand Down
17 changes: 14 additions & 3 deletions src/core/src/index/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ use getset::{CopyGetters, Getters, Setters};
use log::trace;
use serde::{Deserialize, Serialize};
use stats::{median, stddev};
use std::cmp::max;
use typed_builder::TypedBuilder;

use crate::ani_utils::{ani_ci_from_containment, ani_from_containment};
Expand Down Expand Up @@ -205,7 +206,6 @@ where
}
}

// note all mh should be selected/downsampled prior to being passed in here.
#[allow(clippy::too_many_arguments)]
pub fn calculate_gather_stats(
orig_query: &KmerMinHash,
Expand All @@ -220,10 +220,21 @@ pub fn calculate_gather_stats(
confidence: Option<f64>,
) -> Result<GatherResult> {
// get match_mh
let match_mh = match_sig.minhash().unwrap();
let match_mh = match_sig.minhash().expect("cannot retrieve sketch");

let max_scaled = max(match_mh.scaled(), query.scaled());
let query = query
.downsample_scaled(max_scaled)
.expect("cannot downsample query");
let match_mh = match_mh
.clone()
.downsample_scaled(max_scaled)
.expect("cannot downsample match");

// calculate intersection
let isect = match_mh.intersection(&query)?;
let isect = match_mh
.intersection(&query)
.expect("could not do intersection");
let isect_size = isect.0.len();
trace!("isect_size: {}", isect_size);
trace!("query.size: {}", query.size());
Expand Down
20 changes: 12 additions & 8 deletions src/core/src/index/revindex/disk_revindex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -372,7 +372,6 @@ impl RevIndexOps for RevIndex {
let mut query = KmerMinHashBTree::from(orig_query.clone());
let mut sum_weighted_found = 0;
let _selection = selection.unwrap_or_else(|| self.collection.selection());
let mut orig_query_ds = orig_query.clone();
let total_weighted_hashes = orig_query.sum_abunds();

// or set this with user --track-abundance?
Expand All @@ -393,29 +392,33 @@ impl RevIndexOps for RevIndex {
break;
}

// this should downsample mh for us
let match_sig = self.collection.sig_for_dataset(dataset_id)?;

// get downsampled minhashes for comparison.
let match_mh = match_sig.minhash().unwrap().clone();
query = query.downsample_scaled(match_mh.scaled())?;
orig_query_ds = orig_query_ds.downsample_scaled(match_mh.scaled())?;
let scaled = query.scaled();

let match_mh = match_mh
.downsample_scaled(scaled)
.expect("cannot downsample match");

// just calculate essentials here
let gather_result_rank = matches.len();

let query_mh = KmerMinHash::from(query.clone());

// grab the specific intersection:
let isect = match_mh.intersection(&query_mh)?;
let isect = match_mh
.intersection(&query_mh)
.expect("failed to intersect");
let mut isect_mh = match_mh.clone();
isect_mh.clear();
isect_mh.add_many(&isect.0)?;

// Calculate stats
let gather_result = calculate_gather_stats(
&orig_query_ds,
KmerMinHash::from(query.clone()),
&orig_query,
query_mh,
match_sig,
match_size,
gather_result_rank,
Expand All @@ -424,7 +427,8 @@ impl RevIndexOps for RevIndex {
calc_abund_stats,
calc_ani_ci,
ani_confidence_interval_fraction,
)?;
)
.expect("could not calculate gather stats");
// keep track of the sum weighted found
sum_weighted_found = gather_result.sum_weighted_found();
matches.push(gather_result);
Expand Down
18 changes: 10 additions & 8 deletions src/core/src/index/revindex/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -903,14 +903,16 @@ mod test {

let (counter, query_colors, hash_to_color) = index.prepare_gather_counters(&query);

let matches_external = index.gather(
counter,
query_colors,
hash_to_color,
0,
&query,
Some(selection.clone()),
)?;
let matches_external = index
.gather(
counter,
query_colors,
hash_to_color,
0,
&query,
Some(selection.clone()),
)
.expect("failed to gather!");

{
let mut index = index;
Expand Down
24 changes: 23 additions & 1 deletion src/core/src/signature.rs
Original file line number Diff line number Diff line change
Expand Up @@ -842,7 +842,7 @@
// TODO: also account for LargeMinHash
if let Sketch::MinHash(mh) = sketch {
if (mh.scaled() as u32) < sel_scaled {
*sketch = Sketch::MinHash(mh.downsample_scaled(sel_scaled as u64)?);
*sketch = Sketch::MinHash(mh.clone().downsample_scaled(sel_scaled as u64)?);
}
}
}
Expand Down Expand Up @@ -887,6 +887,28 @@
}
}

impl TryInto<KmerMinHash> for Signature {
type Error = Error;

fn try_into(self) -> Result<KmerMinHash, Error> {
match self.signatures.len() {
1 => self

Check warning on line 895 in src/core/src/signature.rs

View check run for this annotation

Codecov / codecov/patch

src/core/src/signature.rs#L893-L895

Added lines #L893 - L895 were not covered by tests
.signatures
.into_iter()
.find_map(|sk| {
if let Sketch::MinHash(mh) = sk {

Check warning on line 899 in src/core/src/signature.rs

View check run for this annotation

Codecov / codecov/patch

src/core/src/signature.rs#L898-L899

Added lines #L898 - L899 were not covered by tests
Some(mh)
} else {
None

Check warning on line 902 in src/core/src/signature.rs

View check run for this annotation

Codecov / codecov/patch

src/core/src/signature.rs#L902

Added line #L902 was not covered by tests
}
})
.ok_or_else(|| Error::NoMinHashFound),
0 => Err(Error::EmptySignature),
_ => Err(Error::MultipleSketchesFound),

Check warning on line 907 in src/core/src/signature.rs

View check run for this annotation

Codecov / codecov/patch

src/core/src/signature.rs#L905-L907

Added lines #L905 - L907 were not covered by tests
}
}
}

#[cfg(test)]
mod test {
use std::fs::File;
Expand Down
Loading
Loading