From 3009725f618aabb611ec593cc0dea10e27a743d1 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 19 Jun 2024 06:39:56 -0700 Subject: [PATCH] MRG: adjust how ANI is calculated in the revindex code. (#3218) Calculate ANI of matches against original query with `f_orig_query` and `f_match_orig`, instead of against `f_unique_to_query` and `f_match`. This fixes the ANI differences between `sourmash gather` and RocksDB branchwater gather for the columns `query_containment_ani`, `match_containment_ani`, `max_containment_ani`, and `average_containment_ani`. Refs: * Used by https://github.com/sourmash-bio/sourmash_plugin_branchwater/pull/361 * Fixes RocksDB-based calculations for https://github.com/sourmash-bio/sourmash_plugin_branchwater/issues/331 --- src/core/Cargo.toml | 2 +- src/core/src/index/mod.rs | 4 ++-- src/core/src/index/revindex/mod.rs | 3 +++ 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml index 71249863a1..c89cb0d28d 100644 --- a/src/core/Cargo.toml +++ b/src/core/Cargo.toml @@ -44,7 +44,7 @@ md5 = "0.7.0" memmap2 = "0.9.4" murmurhash3 = "0.0.5" needletail = { version = "0.5.1", default-features = false } -niffler = { version = "2.6.0", default-features = false, features = [ "gz" ] } +niffler = { version = "2.4.0", default-features = false, features = [ "gz" ] } nohash-hasher = "0.2.0" num-iter = "0.1.45" once_cell = "1.18.0" diff --git a/src/core/src/index/mod.rs b/src/core/src/index/mod.rs index 3ce2652808..4816a56cc8 100644 --- a/src/core/src/index/mod.rs +++ b/src/core/src/index/mod.rs @@ -244,8 +244,8 @@ pub fn calculate_gather_stats( // // get ANI values let ksize = match_mh.ksize() as f64; - let query_containment_ani = ani_from_containment(f_unique_to_query, ksize); - let match_containment_ani = ani_from_containment(f_match, ksize); + let query_containment_ani = ani_from_containment(f_orig_query, ksize); + let match_containment_ani = ani_from_containment(f_match_orig, ksize); let mut query_containment_ani_ci_low = None; let mut query_containment_ani_ci_high = None; let mut match_containment_ani_ci_low = None; diff --git a/src/core/src/index/revindex/mod.rs b/src/core/src/index/revindex/mod.rs index fc03896385..8e2b35f716 100644 --- a/src/core/src/index/revindex/mod.rs +++ b/src/core/src/index/revindex/mod.rs @@ -814,6 +814,7 @@ mod test { assert_eq!(round5(match_.f_unique_to_query()), round5(0.13096862)); assert_eq!(match_.unique_intersect_bp, 1920000); assert_eq!(match_.remaining_bp, 12740000); + assert_eq!(round5(match_.query_containment_ani()), round5(0.90773763)); let match_ = &matches[1]; let names: Vec<&str> = match_.name().split(' ').take(1).collect(); @@ -822,6 +823,7 @@ mod test { assert_eq!(round5(match_.f_unique_to_query()), round5(0.115279)); assert_eq!(match_.unique_intersect_bp, 1690000); assert_eq!(match_.remaining_bp, 11050000); + assert_eq!(round5(match_.query_containment_ani()), round5(0.9068280)); let match_ = &matches[2]; dbg!(match_); @@ -831,6 +833,7 @@ mod test { assert_eq!(round5(match_.f_unique_to_query()), round5(0.0627557)); assert_eq!(match_.unique_intersect_bp, 920000); assert_eq!(match_.remaining_bp, 10130000); + assert_eq!(round5(match_.query_containment_ani()), round5(0.90728512)); Ok(()) }