Skip to content

Commit

Permalink
Replace mx by scaled (#1139)
Browse files Browse the repository at this point in the history
* Replace mx by scaled
* Change kmerminhash_new signature
* fix small issues
* Change to hash_function
* Fix some test issues
* update scaled_for_max_hash
* Remove extra space
* Fix some wasm issues
* fix lints
* bump rust version
  • Loading branch information
xmnlab authored Oct 8, 2020
1 parent 7a8e5ac commit 5e8b66a
Show file tree
Hide file tree
Showing 10 changed files with 149 additions and 126 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,6 @@ Cargo.lock
pkg/
wasm-pack.log
.hypothesis

# IDEs
.vscode/
10 changes: 4 additions & 6 deletions include/sourmash.h
Original file line number Diff line number Diff line change
Expand Up @@ -174,14 +174,12 @@ SourmashStr kmerminhash_md5sum(const SourmashKmerMinHash *ptr);

void kmerminhash_merge(SourmashKmerMinHash *ptr, const SourmashKmerMinHash *other);

SourmashKmerMinHash *kmerminhash_new(uint32_t n,
SourmashKmerMinHash *kmerminhash_new(uint64_t scaled,
uint32_t k,
bool prot,
bool dayhoff,
bool hp,
HashFunctions hash_function,
uint64_t seed,
uint64_t mx,
bool track_abundance);
bool track_abundance,
uint32_t n);

uint32_t kmerminhash_num(const SourmashKmerMinHash *ptr);

Expand Down
34 changes: 28 additions & 6 deletions sourmash/minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,20 @@ def _get_max_hash_for_scaled(scaled):
elif scaled == 1:
return get_minhash_max_hash()

return int(round(get_minhash_max_hash() / scaled, 0))
return min(
int(round(get_minhash_max_hash() / scaled, 0)),
MINHASH_MAX_HASH
)


def _get_scaled_for_max_hash(max_hash):
"Convert a 'max_hash' value into a 'scaled' value."
if max_hash == 0:
return 0
return int(round(get_minhash_max_hash() / max_hash, 0))
return min(
int(round(get_minhash_max_hash() / max_hash, 0)),
MINHASH_MAX_HASH
)


def to_bytes(s):
Expand Down Expand Up @@ -179,10 +185,17 @@ def __init__(
if dayhoff or hp:
is_protein = False

# ok, for Rust API, go from scaled back to max_hash
max_hash = _get_max_hash_for_scaled(scaled)
if dayhoff:
hash_function = lib.HASH_FUNCTIONS_MURMUR64_DAYHOFF
elif hp:
hash_function = lib.HASH_FUNCTIONS_MURMUR64_HP
elif is_protein:
hash_function = lib.HASH_FUNCTIONS_MURMUR64_PROTEIN
else:
hash_function = lib.HASH_FUNCTIONS_MURMUR64_DNA

self._objptr = lib.kmerminhash_new(
n, ksize, is_protein, dayhoff, hp, seed, int(max_hash), track_abundance
scaled, ksize, hash_function, seed, track_abundance, n
)

if mins:
Expand Down Expand Up @@ -227,8 +240,17 @@ def __setstate__(self, tup):
max_hash, seed) = tup

self.__del__()

hash_function = (
lib.HASH_FUNCTIONS_MURMUR64_DAYHOFF if dayhoff else
lib.HASH_FUNCTIONS_MURMUR64_HP if hp else
lib.HASH_FUNCTIONS_MURMUR64_PROTEIN if is_protein else
lib.HASH_FUNCTIONS_MURMUR64_DNA
)

scaled = _get_scaled_for_max_hash(max_hash)
self._objptr = lib.kmerminhash_new(
n, ksize, is_protein, dayhoff, hp, seed, max_hash, track_abundance
scaled, ksize, hash_function, seed, track_abundance, n
)
if track_abundance:
self.set_abundances(mins)
Expand Down
4 changes: 2 additions & 2 deletions src/core/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "sourmash"
version = "0.9.0"
version = "0.10.0"
authors = ["Luiz Irber <[email protected]>"]
description = "MinHash sketches for genomic data"
repository = "https://github.com/dib-lab/sourmash"
Expand All @@ -24,7 +24,7 @@ parallel = ["rayon"]
[dependencies]
backtrace = "=0.3.46" # later versions require rust 1.40
byteorder = "1.3.4"
cfg-if = "0.1.10"
cfg-if = "1.0"
failure = "0.1.8" # can remove after .backtrace() is available in std::error::Error
finch = { version = "0.3.0", optional = true }
fixedbitset = "0.3.0"
Expand Down
2 changes: 1 addition & 1 deletion src/core/src/cmd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ impl Default for ComputeParameters {
}

pub fn build_template(params: &ComputeParameters) -> Vec<Sketch> {
let max_hash = max_hash_for_scaled(params.scaled).unwrap_or(0);
let max_hash = max_hash_for_scaled(params.scaled);

params
.ksizes
Expand Down
30 changes: 10 additions & 20 deletions src/core/src/ffi/minhash.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,28 +16,14 @@ impl ForeignObject for SourmashKmerMinHash {

#[no_mangle]
pub unsafe extern "C" fn kmerminhash_new(
n: u32,
scaled: u64,
k: u32,
prot: bool,
dayhoff: bool,
hp: bool,
hash_function: HashFunctions,
seed: u64,
mx: u64,
track_abundance: bool,
n: u32,
) -> *mut SourmashKmerMinHash {
// TODO: at most one of (prot, dayhoff, hp) should be true

let hash_function = if dayhoff {
HashFunctions::murmur64_dayhoff
} else if hp {
HashFunctions::murmur64_hp
} else if prot {
HashFunctions::murmur64_protein
} else {
HashFunctions::murmur64_DNA
};

let mh = KmerMinHash::new(n, k, hash_function, seed, mx, track_abundance);
let mh = KmerMinHash::new(scaled, k, hash_function, seed, track_abundance, n);

SourmashKmerMinHash::from_rust(mh)
}
Expand Down Expand Up @@ -105,7 +91,11 @@ pub unsafe extern "C" fn kmerminhash_add_hash(ptr: *mut SourmashKmerMinHash, h:
}

#[no_mangle]
pub unsafe extern "C" fn kmerminhash_add_hash_with_abundance(ptr: *mut SourmashKmerMinHash, h: u64, abundance: u64) {
pub unsafe extern "C" fn kmerminhash_add_hash_with_abundance(
ptr: *mut SourmashKmerMinHash,
h: u64,
abundance: u64,
) {
let mh = SourmashKmerMinHash::as_rust_mut(ptr);

mh.add_hash_with_abundance(h, abundance);
Expand Down Expand Up @@ -259,7 +249,7 @@ unsafe fn kmerminhash_set_abundances(
};

let mut pairs: Vec<_> = hashes.iter().cloned().zip(abunds.iter().cloned()).collect();
pairs.sort();
pairs.sort_unstable();

// Reset the minhash
if clear {
Expand Down
8 changes: 4 additions & 4 deletions src/core/src/from.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@ impl From<MashSketcher> for KmerMinHash {
let values = other.to_vec();

let mut new_mh = KmerMinHash::new(
values.len() as u32,
0,
values.get(0).unwrap().kmer.len() as u32,
HashFunctions::murmur64_DNA,
42,
0,
true,
values.len() as u32,
);

let hash_with_abunds: Vec<(u64, u64)> = values
Expand Down Expand Up @@ -52,7 +52,7 @@ mod test {

#[test]
fn finch_behavior() {
let mut a = KmerMinHash::new(20, 10, HashFunctions::murmur64_DNA, 42, 0, true);
let mut a = KmerMinHash::new(0, 10, HashFunctions::murmur64_DNA, 42, true, 20);
let mut b = MashSketcher::new(20, 10, 42);

let seq = b"TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA";
Expand Down Expand Up @@ -89,7 +89,7 @@ mod test {

#[test]
fn from_finch() {
let mut a = KmerMinHash::new(20, 10, HashFunctions::murmur64_DNA, 42, 0, true);
let mut a = KmerMinHash::new(0, 10, HashFunctions::murmur64_DNA, 42, true, 20);
let mut b = MashSketcher::new(20, 10, 42);

let seq = b"TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA";
Expand Down
Loading

0 comments on commit 5e8b66a

Please sign in to comment.