Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace mx by scaled #1139

Merged
merged 11 commits into from
Oct 8, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,6 @@ Cargo.lock
pkg/
wasm-pack.log
.hypothesis

# IDEs
.vscode/
10 changes: 4 additions & 6 deletions include/sourmash.h
Original file line number Diff line number Diff line change
Expand Up @@ -174,14 +174,12 @@ SourmashStr kmerminhash_md5sum(const SourmashKmerMinHash *ptr);

void kmerminhash_merge(SourmashKmerMinHash *ptr, const SourmashKmerMinHash *other);

SourmashKmerMinHash *kmerminhash_new(uint32_t n,
SourmashKmerMinHash *kmerminhash_new(uint64_t scaled,
uint32_t k,
bool prot,
bool dayhoff,
bool hp,
HashFunctions hash_function,
uint64_t seed,
uint64_t mx,
bool track_abundance);
bool track_abundance,
uint32_t n);

uint32_t kmerminhash_num(const SourmashKmerMinHash *ptr);

Expand Down
34 changes: 28 additions & 6 deletions sourmash/minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,20 @@ def _get_max_hash_for_scaled(scaled):
elif scaled == 1:
return get_minhash_max_hash()

return int(round(get_minhash_max_hash() / scaled, 0))
return min(
int(round(get_minhash_max_hash() / scaled, 0)),
MINHASH_MAX_HASH
)


def _get_scaled_for_max_hash(max_hash):
"Convert a 'max_hash' value into a 'scaled' value."
if max_hash == 0:
return 0
return int(round(get_minhash_max_hash() / max_hash, 0))
return min(
int(round(get_minhash_max_hash() / max_hash, 0)),
MINHASH_MAX_HASH
)


def to_bytes(s):
Expand Down Expand Up @@ -179,10 +185,17 @@ def __init__(
if dayhoff or hp:
is_protein = False

# ok, for Rust API, go from scaled back to max_hash
max_hash = _get_max_hash_for_scaled(scaled)
if dayhoff:
hash_function = lib.HASH_FUNCTIONS_MURMUR64_DAYHOFF
elif hp:
hash_function = lib.HASH_FUNCTIONS_MURMUR64_HP
elif is_protein:
hash_function = lib.HASH_FUNCTIONS_MURMUR64_PROTEIN
else:
hash_function = lib.HASH_FUNCTIONS_MURMUR64_DNA

self._objptr = lib.kmerminhash_new(
n, ksize, is_protein, dayhoff, hp, seed, int(max_hash), track_abundance
scaled, ksize, hash_function, seed, track_abundance, n
)

if mins:
Expand Down Expand Up @@ -227,8 +240,17 @@ def __setstate__(self, tup):
max_hash, seed) = tup

self.__del__()

hash_function = (
lib.HASH_FUNCTIONS_MURMUR64_DAYHOFF if dayhoff else
lib.HASH_FUNCTIONS_MURMUR64_HP if hp else
lib.HASH_FUNCTIONS_MURMUR64_PROTEIN if is_protein else
lib.HASH_FUNCTIONS_MURMUR64_DNA
Comment on lines +245 to +248
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it makes sense that this works, but I've never seen it being used this way =]

)

scaled = _get_scaled_for_max_hash(max_hash)
self._objptr = lib.kmerminhash_new(
n, ksize, is_protein, dayhoff, hp, seed, max_hash, track_abundance
scaled, ksize, hash_function, seed, track_abundance, n
)
if track_abundance:
self.set_abundances(mins)
Expand Down
4 changes: 2 additions & 2 deletions src/core/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "sourmash"
version = "0.9.0"
version = "0.10.0"
authors = ["Luiz Irber <[email protected]>"]
description = "MinHash sketches for genomic data"
repository = "https://github.com/dib-lab/sourmash"
Expand All @@ -24,7 +24,7 @@ parallel = ["rayon"]
[dependencies]
backtrace = "=0.3.46" # later versions require rust 1.40
byteorder = "1.3.4"
cfg-if = "0.1.10"
cfg-if = "1.0"
failure = "0.1.8" # can remove after .backtrace() is available in std::error::Error
finch = { version = "0.3.0", optional = true }
fixedbitset = "0.3.0"
Expand Down
2 changes: 1 addition & 1 deletion src/core/src/cmd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ impl Default for ComputeParameters {
}

pub fn build_template(params: &ComputeParameters) -> Vec<Sketch> {
let max_hash = max_hash_for_scaled(params.scaled).unwrap_or(0);
let max_hash = max_hash_for_scaled(params.scaled);

params
.ksizes
Expand Down
30 changes: 10 additions & 20 deletions src/core/src/ffi/minhash.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,28 +16,14 @@ impl ForeignObject for SourmashKmerMinHash {

#[no_mangle]
pub unsafe extern "C" fn kmerminhash_new(
n: u32,
scaled: u64,
k: u32,
prot: bool,
dayhoff: bool,
hp: bool,
hash_function: HashFunctions,
seed: u64,
mx: u64,
track_abundance: bool,
n: u32,
) -> *mut SourmashKmerMinHash {
// TODO: at most one of (prot, dayhoff, hp) should be true

let hash_function = if dayhoff {
HashFunctions::murmur64_dayhoff
} else if hp {
HashFunctions::murmur64_hp
} else if prot {
HashFunctions::murmur64_protein
} else {
HashFunctions::murmur64_DNA
};

let mh = KmerMinHash::new(n, k, hash_function, seed, mx, track_abundance);
let mh = KmerMinHash::new(scaled, k, hash_function, seed, track_abundance, n);

SourmashKmerMinHash::from_rust(mh)
}
Expand Down Expand Up @@ -105,7 +91,11 @@ pub unsafe extern "C" fn kmerminhash_add_hash(ptr: *mut SourmashKmerMinHash, h:
}

#[no_mangle]
pub unsafe extern "C" fn kmerminhash_add_hash_with_abundance(ptr: *mut SourmashKmerMinHash, h: u64, abundance: u64) {
pub unsafe extern "C" fn kmerminhash_add_hash_with_abundance(
ptr: *mut SourmashKmerMinHash,
h: u64,
abundance: u64,
) {
let mh = SourmashKmerMinHash::as_rust_mut(ptr);

mh.add_hash_with_abundance(h, abundance);
Expand Down Expand Up @@ -259,7 +249,7 @@ unsafe fn kmerminhash_set_abundances(
};

let mut pairs: Vec<_> = hashes.iter().cloned().zip(abunds.iter().cloned()).collect();
pairs.sort();
pairs.sort_unstable();

// Reset the minhash
if clear {
Expand Down
8 changes: 4 additions & 4 deletions src/core/src/from.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@ impl From<MashSketcher> for KmerMinHash {
let values = other.to_vec();

let mut new_mh = KmerMinHash::new(
values.len() as u32,
0,
values.get(0).unwrap().kmer.len() as u32,
HashFunctions::murmur64_DNA,
42,
0,
true,
values.len() as u32,
);

let hash_with_abunds: Vec<(u64, u64)> = values
Expand Down Expand Up @@ -52,7 +52,7 @@ mod test {

#[test]
fn finch_behavior() {
let mut a = KmerMinHash::new(20, 10, HashFunctions::murmur64_DNA, 42, 0, true);
let mut a = KmerMinHash::new(0, 10, HashFunctions::murmur64_DNA, 42, true, 20);
let mut b = MashSketcher::new(20, 10, 42);

let seq = b"TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA";
Expand Down Expand Up @@ -89,7 +89,7 @@ mod test {

#[test]
fn from_finch() {
let mut a = KmerMinHash::new(20, 10, HashFunctions::murmur64_DNA, 42, 0, true);
let mut a = KmerMinHash::new(0, 10, HashFunctions::murmur64_DNA, 42, true, 20);
let mut b = MashSketcher::new(20, 10, 42);

let seq = b"TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA";
Expand Down
Loading