Skip to content

Commit

Permalink
Tests for count and consume with kmer tracking enabled
Browse files Browse the repository at this point in the history
  • Loading branch information
Adamtaranto committed Sep 30, 2024
1 parent dea4f61 commit 61c2a8f
Showing 1 changed file with 165 additions and 28 deletions.
193 changes: 165 additions & 28 deletions src/python/tests/test_kmers_and_hashes.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,6 @@
import oxli


# Helper function, create tables.
def create_sample_kmer_table(ksize, kmers):
table = oxli.KmerCountTable(ksize)
for kmer in kmers:
table.count(kmer)
return table


def test_basic():
"string containing only forward canonical kmers."
seq = "ATAAACC" # all forward k-mers
Expand Down Expand Up @@ -72,24 +64,6 @@ def test_basic_lower():
]


# def test_bad_kmers_raise_error():
# "Test that bad k-mers raise a ValueError with info"
# seq = "acxttg"
# cg = oxli.KmerCountTable(ksize=4)
#
# with pytest.raises(ValueError, match="bad k-mer at position 0: ACXT"):
# x = cg.kmers_and_hashes(seq, False)
#
#
# def test_bad_kmers_raise_error_2():
# "Test bad k-mers raise the right error even when not at beginning :)"
# seq = "aattxttgg"
# cg = oxli.KmerCountTable(ksize=4)
#
# with pytest.raises(ValueError, match="bad k-mer at position 1: ATTX"):
# x = cg.kmers_and_hashes(seq, False)


def test_bad_kmers_raise_warning(capfd):
"Test that bad k-mers print warning with info"
seq = "acxttg"
Expand All @@ -100,7 +74,7 @@ def test_bad_kmers_raise_warning(capfd):
captured = capfd.readouterr()

# Check for warning in stderr
assert f"bad k-mer at position 0: ACXT" in captured.err
assert f"bad k-mer at position 1: ACXT" in captured.err


def test_bad_kmers_raise_warning_2(capfd):
Expand All @@ -113,7 +87,7 @@ def test_bad_kmers_raise_warning_2(capfd):
captured = capfd.readouterr()

# Check for warning in stderr
assert f"bad k-mer at position 1: ATTX" in captured.err
assert f"bad k-mer at position 2: ATTX" in captured.err


def test_report_bad_kmers():
Expand Down Expand Up @@ -144,3 +118,166 @@ def test_skip_bad_kmers():
("AATT", 382727017318141683),
("CCAA", 1798905482136869687),
]


# Tests for hash:kmer storage and retreival


def test_count_saves_kmer():
"""Test that count() stores k-mers and their corresponding hashes when store_kmers=True."""
kmer = "AAAA"
cg = oxli.KmerCountTable(ksize=4, store_kmers=True)

# Call count() on a k-mer
count = cg.count(kmer)

# Check that the k-mer was counted
assert count == 1, f"Expected count to be 1 after first insertion, but got {count}"

# Hash value of the k-mer should now exist in the hash_to_kmer map
hashval = cg.hash_kmer(kmer)

# Check that the k-mer is stored correctly in the hash_to_kmer map
stored_kmer = cg.unhash(hashval)
assert (
stored_kmer == kmer
), f"Expected stored k-mer to be {kmer}, but got {stored_kmer}"


def test_count_saves_canonical_kmer():
"""Test that count() stores correct canonical form of k-mers and their corresponding hashes when store_kmers=True."""
cg = oxli.KmerCountTable(ksize=4, store_kmers=True)
kmer = "TTTT"
canon_kmer = "AAAA"

# Call count() on a k-mer
cg.count(kmer)

# Hash value of the k-mer should now exist in the hash_to_kmer map
hashval = cg.hash_kmer(kmer)

# Check that the k-mer is stored correctly in the hash_to_kmer map
stored_kmer = cg.unhash(hashval)

assert (
stored_kmer == canon_kmer
), f"Expected stored k-mer to be {canon_kmer}, but got {stored_kmer}"


def test_consume_saves_kmers():
"""Test that consume() processes a sequence and stores k-mers and their hashes."""
seq = "ACGTTG"
cg = oxli.KmerCountTable(ksize=4, store_kmers=True)

# Consume the sequence, expecting 3 k-mers ("ACGT", "AACG", "CAAC")
n_kmers = cg.consume(seq)

# Check that 3 k-mers were processed
assert n_kmers == 3, f"Expected to consume 3 k-mers, but got {n_kmers}"

# Check that all k-mers are stored in the hash_to_kmer map
for kmer in ["ACGT", "AACG", "CAAC"]:
hashval = cg.hash_kmer(kmer)
stored_kmer = cg.unhash(hashval)
assert (
stored_kmer == kmer
), f"Expected stored k-mer to be {kmer}, but got {stored_kmer}"


def test_count_increments_kmer():
"""Test that count() increments the count of a k-mer when called multiple times."""
kmer = "AAAA"
rev_kmer = "TTTT"
cg = oxli.KmerCountTable(ksize=4, store_kmers=True)

# Call count() twice on the same k-mer
count1 = cg.count(kmer)
count2 = cg.count(rev_kmer)

# Check that the count has incremented
assert (
count1 == 1
), f"Expected count to be 1 after first insertion, but got {count1}"
assert (
count2 == 2
), f"Expected count to be 2 after second insertion, but got {count2}"

# Ensure the k-mer is still stored correctly in hash_to_kmer
hashval = cg.hash_kmer(kmer)
stored_kmer = cg.unhash(hashval)
assert (
stored_kmer == kmer
), f"Expected stored k-mer to be {kmer}, but got {stored_kmer}"


def test_consume_increments_kmers():
"""Test that consume() increments k-mer counts when the same k-mers are encountered."""
sequence = "AAAAACCCC" # Contains overlapping "AAAA" twice
cg = oxli.KmerCountTable(ksize=4, store_kmers=True)

# Consume the sequence, expecting 6 k-mers (AAAA, AAAA, AAAC, AACC, ACCC, CCCC)
n_kmers = cg.consume(sequence)

# Check that 6 k-mers were processed
assert n_kmers == 6, f"Expected to consume 6 k-mers, but got {n_kmers}"

# Check that the count for "AAAA" is now 2
assert cg.get("AAAA") == 2, "Expected count for 'AAAA' to be 2"


def test_unhash_invalid_kmer():
"""Test that unhash() raises an error when given an invalid hash."""
cg = oxli.KmerCountTable(ksize=4, store_kmers=True)
cg.count("AAAA")

invalid_hash = 1234567890 # A hash that doesn't exist

# Expecting an exception when trying to unhash an invalid value
with pytest.raises(
KeyError, match=f"Warning: Hash {invalid_hash} not found in table."
):
cg.unhash(invalid_hash)


def test_unhash_no_kmer_table():
"""Test that unhash() raises an error when used on a count table without kmer tracking."""
cg = oxli.KmerCountTable(ksize=3, store_kmers=False)
kmer = "AAA"
cg.count(kmer)

real_hash = cg.hash_kmer(kmer)

# Expecting an exception when trying to unhash an invalid value
with pytest.raises(ValueError, match="K-mer storage is not enabled."):
cg.unhash(real_hash)


def test_consume_invalid_kmers(capfd):
"""Test that consume() processes a sequence and stores k-mers and their hashes."""
seq = "XAAAAAXGGGG"
cg = oxli.KmerCountTable(ksize=3, store_kmers=True)

# Consume the sequence, expecting 5 k-mers ("AAA", "AAA", "AAA", "GGG", "GGG")
n_kmers = cg.consume(seq) # [(10679328328772601858, 3), (12126843654075378313, 2)]
# Capture stderr warnings for bad kmers
captured = capfd.readouterr()

# Check for warnings in stderr
assert "bad k-mer at position 1: XAA" in captured.err
assert "bad k-mer at position 5: AAX" in captured.err
assert "bad k-mer at position 6: AXG" in captured.err
assert "bad k-mer at position 7: XGG" in captured.err

# Check that 5 k-mers were processed
assert n_kmers == 5, f"Expected to consume 2 k-mers, but got {n_kmers}"

# Check 2 distinct kmers
assert len(cg) == 2, "Expected exactly 2 distinct kmers"

# Check that all k-mers are stored in the hash_to_kmer map
for kmer in ["AAA", "CCC"]:
hashval = cg.hash_kmer(kmer)
stored_kmer = cg.unhash(hashval)
assert (
stored_kmer == kmer
), f"Expected stored k-mer to be {kmer}, but got {stored_kmer}"

0 comments on commit 61c2a8f

Please sign in to comment.