Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace mx by scaled #1139

Merged
merged 11 commits into from
Oct 8, 2020
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,6 @@ Cargo.lock
pkg/
wasm-pack.log
.hypothesis

# IDEs
.vscode/
10 changes: 4 additions & 6 deletions include/sourmash.h
Original file line number Diff line number Diff line change
Expand Up @@ -174,14 +174,12 @@ SourmashStr kmerminhash_md5sum(const SourmashKmerMinHash *ptr);

void kmerminhash_merge(SourmashKmerMinHash *ptr, const SourmashKmerMinHash *other);

SourmashKmerMinHash *kmerminhash_new(uint32_t n,
SourmashKmerMinHash *kmerminhash_new(uint64_t scaled,
uint32_t k,
bool prot,
bool dayhoff,
bool hp,
HashFunctions hash_function,
uint64_t seed,
uint64_t mx,
bool track_abundance);
bool track_abundance,
uint32_t n);

uint32_t kmerminhash_num(const SourmashKmerMinHash *ptr);

Expand Down
34 changes: 28 additions & 6 deletions sourmash/minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,20 @@ def _get_max_hash_for_scaled(scaled):
elif scaled == 1:
return get_minhash_max_hash()

return int(round(get_minhash_max_hash() / scaled, 0))
return min(
int(round(get_minhash_max_hash() / scaled, 0)),
MINHASH_MAX_HASH
)


def _get_scaled_for_max_hash(max_hash):
"Convert a 'max_hash' value into a 'scaled' value."
if max_hash == 0:
return 0
return int(round(get_minhash_max_hash() / max_hash, 0))
return min(
int(round(get_minhash_max_hash() / max_hash, 0)),
MINHASH_MAX_HASH
)


def to_bytes(s):
Expand Down Expand Up @@ -179,10 +185,17 @@ def __init__(
if dayhoff or hp:
is_protein = False

# ok, for Rust API, go from scaled back to max_hash
max_hash = _get_max_hash_for_scaled(scaled)
if dayhoff:
hash_function = lib.HASH_FUNCTIONS_MURMUR64_DAYHOFF
elif hp:
hash_function = lib.HASH_FUNCTIONS_MURMUR64_HP
elif is_protein:
hash_function = lib.HASH_FUNCTIONS_MURMUR64_PROTEIN
else:
hash_function = lib.HASH_FUNCTIONS_MURMUR64_DNA

self._objptr = lib.kmerminhash_new(
n, ksize, is_protein, dayhoff, hp, seed, int(max_hash), track_abundance
scaled, ksize, hash_function, seed, track_abundance, n
)

if mins:
Expand Down Expand Up @@ -227,8 +240,17 @@ def __setstate__(self, tup):
max_hash, seed) = tup

self.__del__()

hash_function = (
lib.HASH_FUNCTIONS_MURMUR64_DAYHOFF if dayhoff else
lib.HASH_FUNCTIONS_MURMUR64_HP if hp else
lib.HASH_FUNCTIONS_MURMUR64_PROTEIN if is_protein else
lib.HASH_FUNCTIONS_MURMUR64_DNA
Comment on lines +245 to +248
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it makes sense that this works, but I've never seen it being used this way =]

)

scaled = _get_scaled_for_max_hash(max_hash)
self._objptr = lib.kmerminhash_new(
n, ksize, is_protein, dayhoff, hp, seed, max_hash, track_abundance
scaled, ksize, hash_function, seed, track_abundance, n
)
if track_abundance:
self.set_abundances(mins)
Expand Down
2 changes: 1 addition & 1 deletion src/core/src/cmd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ impl Default for ComputeParameters {
}

pub fn build_template(params: &ComputeParameters) -> Vec<Sketch> {
let max_hash = max_hash_for_scaled(params.scaled).unwrap_or(0);
let max_hash = max_hash_for_scaled(params.scaled);

params
.ksizes
Expand Down
28 changes: 9 additions & 19 deletions src/core/src/ffi/minhash.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,28 +16,14 @@ impl ForeignObject for SourmashKmerMinHash {

#[no_mangle]
pub unsafe extern "C" fn kmerminhash_new(
n: u32,
scaled: u64,
k: u32,
prot: bool,
dayhoff: bool,
hp: bool,
hash_function: HashFunctions,
seed: u64,
mx: u64,
track_abundance: bool,
n: u32,
) -> *mut SourmashKmerMinHash {
// TODO: at most one of (prot, dayhoff, hp) should be true

let hash_function = if dayhoff {
HashFunctions::murmur64_dayhoff
} else if hp {
HashFunctions::murmur64_hp
} else if prot {
HashFunctions::murmur64_protein
} else {
HashFunctions::murmur64_DNA
};

let mh = KmerMinHash::new(n, k, hash_function, seed, mx, track_abundance);
let mh = KmerMinHash::new(scaled, k, hash_function, seed, track_abundance, n);

SourmashKmerMinHash::from_rust(mh)
}
Expand Down Expand Up @@ -105,7 +91,11 @@ pub unsafe extern "C" fn kmerminhash_add_hash(ptr: *mut SourmashKmerMinHash, h:
}

#[no_mangle]
pub unsafe extern "C" fn kmerminhash_add_hash_with_abundance(ptr: *mut SourmashKmerMinHash, h: u64, abundance: u64) {
pub unsafe extern "C" fn kmerminhash_add_hash_with_abundance(
ptr: *mut SourmashKmerMinHash,
h: u64,
abundance: u64,
) {
let mh = SourmashKmerMinHash::as_rust_mut(ptr);

mh.add_hash_with_abundance(h, abundance);
Expand Down
8 changes: 4 additions & 4 deletions src/core/src/from.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@ impl From<MashSketcher> for KmerMinHash {
let values = other.to_vec();

let mut new_mh = KmerMinHash::new(
values.len() as u32,
0,
values.get(0).unwrap().kmer.len() as u32,
HashFunctions::murmur64_DNA,
42,
0,
true,
values.len() as u32,
);

let hash_with_abunds: Vec<(u64, u64)> = values
Expand Down Expand Up @@ -52,7 +52,7 @@ mod test {

#[test]
fn finch_behavior() {
let mut a = KmerMinHash::new(20, 10, HashFunctions::murmur64_DNA, 42, 0, true);
let mut a = KmerMinHash::new(0, 10, HashFunctions::murmur64_DNA, 42, true, 20);
let mut b = MashSketcher::new(20, 10, 42);

let seq = b"TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA";
Expand Down Expand Up @@ -89,7 +89,7 @@ mod test {

#[test]
fn from_finch() {
let mut a = KmerMinHash::new(20, 10, HashFunctions::murmur64_DNA, 42, 0, true);
let mut a = KmerMinHash::new(0, 10, HashFunctions::murmur64_DNA, 42, true, 20);
let mut b = MashSketcher::new(20, 10, 42);

let seq = b"TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA";
Expand Down
66 changes: 41 additions & 25 deletions src/core/src/sketch/minhash.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,18 +60,18 @@ impl TryFrom<&str> for HashFunctions {
}
}

pub fn max_hash_for_scaled(scaled: u64) -> Option<u64> {
pub fn max_hash_for_scaled(scaled: u64) -> u64 {
match scaled {
0 => None,
1 => Some(u64::max_value()),
_ => Some((u64::max_value() as f64 / scaled as f64) as u64),
0 => 0,
1 => u64::max_value(),
_ => (u64::max_value() as f64 / scaled as f64) as u64,
}
}

pub fn scaled_for_max_hash(max_hash: u64) -> u64 {
match max_hash {
0 => 0,
_ => u64::max_value() / max_hash,
_ => (u64::max_value() as f64 / max_hash as f64) as u64,
}
}

Expand Down Expand Up @@ -222,12 +222,12 @@ impl<'de> Deserialize<'de> for KmerMinHash {

impl KmerMinHash {
pub fn new(
num: u32,
scaled: u64,
ksize: u32,
hash_function: HashFunctions,
seed: u64,
max_hash: u64,
track_abundance: bool,
num: u32,
) -> KmerMinHash {
let mins: Vec<u64>;
let abunds: Option<Vec<u64>>;
Expand All @@ -244,6 +244,8 @@ impl KmerMinHash {
abunds = None
}

let max_hash = max_hash_for_scaled(scaled);

KmerMinHash {
num,
ksize,
Expand Down Expand Up @@ -276,6 +278,10 @@ impl KmerMinHash {
self.max_hash
}

pub fn scaled(&self) -> u64 {
scaled_for_max_hash(self.max_hash)
}

pub fn clear(&mut self) {
self.mins.clear();
if let Some(ref mut abunds) = self.abunds {
Expand Down Expand Up @@ -622,12 +628,12 @@ impl KmerMinHash {
self.check_compatible(other)?;

let mut combined_mh = KmerMinHash::new(
self.num,
self.scaled(),
self.ksize,
self.hash_function,
self.seed,
self.max_hash,
self.abunds.is_some(),
self.num,
);

combined_mh.merge(&self)?;
Expand All @@ -648,12 +654,12 @@ impl KmerMinHash {
self.check_compatible(other)?;

let mut combined_mh = KmerMinHash::new(
self.num,
self.scaled(),
self.ksize,
self.hash_function,
self.seed,
self.max_hash,
self.abunds.is_some(),
self.num,
);

combined_mh.merge(&self)?;
Expand Down Expand Up @@ -776,13 +782,15 @@ impl KmerMinHash {

// create a downsampled copy of self
pub fn downsample_max_hash(&self, max_hash: u64) -> Result<KmerMinHash, Error> {
let scaled = scaled_for_max_hash(max_hash);

let mut new_mh = KmerMinHash::new(
self.num,
scaled,
self.ksize,
self.hash_function,
self.seed,
max_hash, // old max_hash => max_hash arg
self.abunds.is_some(),
self.num,
);
if self.abunds.is_some() {
new_mh.add_many_with_abund(&self.to_vec_abunds())?;
Expand Down Expand Up @@ -1453,12 +1461,12 @@ impl<'de> Deserialize<'de> for KmerMinHashBTree {

impl KmerMinHashBTree {
pub fn new(
num: u32,
scaled: u64,
ksize: u32,
hash_function: HashFunctions,
seed: u64,
max_hash: u64,
track_abundance: bool,
num: u32,
) -> KmerMinHashBTree {
let mins = Default::default();

Expand All @@ -1468,6 +1476,8 @@ impl KmerMinHashBTree {
None
};

let max_hash = max_hash_for_scaled(scaled);

KmerMinHashBTree {
num,
ksize,
Expand Down Expand Up @@ -1501,6 +1511,10 @@ impl KmerMinHashBTree {
self.max_hash
}

pub fn scaled(&self) -> u64 {
scaled_for_max_hash(self.max_hash)
}

pub fn clear(&mut self) {
self.mins.clear();
if let Some(ref mut abunds) = self.abunds {
Expand Down Expand Up @@ -1734,12 +1748,12 @@ impl KmerMinHashBTree {
self.check_compatible(other)?;

let mut combined_mh = KmerMinHashBTree::new(
self.num,
self.scaled(),
self.ksize,
self.hash_function,
self.seed,
self.max_hash,
self.abunds.is_some(),
self.num,
);

combined_mh.merge(&self)?;
Expand All @@ -1761,12 +1775,12 @@ impl KmerMinHashBTree {
self.check_compatible(other)?;

let mut combined_mh = KmerMinHashBTree::new(
self.num,
self.scaled(),
self.ksize,
self.hash_function,
self.seed,
self.max_hash,
self.abunds.is_some(),
self.num,
);

combined_mh.merge(&self)?;
Expand Down Expand Up @@ -1878,13 +1892,15 @@ impl KmerMinHashBTree {

// create a downsampled copy of self
pub fn downsample_max_hash(&self, max_hash: u64) -> Result<KmerMinHashBTree, Error> {
let scaled = scaled_for_max_hash(max_hash);

let mut new_mh = KmerMinHashBTree::new(
self.num,
scaled,
self.ksize,
self.hash_function,
self.seed,
max_hash, // old max_hash => max_hash arg
self.abunds.is_some(),
self.num,
);
if self.abunds.is_some() {
new_mh.add_many_with_abund(&self.to_vec_abunds())?;
Expand Down Expand Up @@ -2066,12 +2082,12 @@ impl SigsTrait for KmerMinHashBTree {
impl From<KmerMinHashBTree> for KmerMinHash {
fn from(other: KmerMinHashBTree) -> KmerMinHash {
let mut new_mh = KmerMinHash::new(
other.num(),
other.scaled(),
other.ksize() as u32,
other.hash_function(),
other.seed(),
other.max_hash(),
other.track_abundance(),
other.num(),
);

let mins = other.mins.into_iter().collect();
Expand All @@ -2091,12 +2107,12 @@ impl From<KmerMinHashBTree> for KmerMinHash {
impl From<KmerMinHash> for KmerMinHashBTree {
fn from(other: KmerMinHash) -> KmerMinHashBTree {
let mut new_mh = KmerMinHashBTree::new(
other.num(),
other.scaled(),
other.ksize() as u32,
other.hash_function(),
other.seed(),
other.max_hash(),
other.track_abundance(),
other.num(),
);

let mins: BTreeSet<u64> = other.mins.into_iter().collect();
Expand Down
Loading