Skip to content

Commit

Permalink
use parse_fastx_stdin, add stdin test, fix tests checking 0 hashes (#573)
Browse files Browse the repository at this point in the history
  • Loading branch information
bluegenes authored Jan 6, 2025
1 parent 1687b30 commit fc9c124
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 12 deletions.
52 changes: 43 additions & 9 deletions src/python/tests/test_sketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import csv
import pandas
import sourmash
import subprocess
from sourmash import index
import io
from . import sourmash_tst_utils as utils
Expand Down Expand Up @@ -1159,7 +1160,7 @@ def test_singlesketch_simple(runtmp):
output = runtmp.output("short.sig")

# Run the singlesketch command
runtmp.sourmash("scripts", "singlesketch", fa1, "-o", output)
runtmp.sourmash("scripts", "singlesketch", fa1, "-o", output, "-p", "scaled=10")

# Check if the output exists and contains the expected data
assert os.path.exists(output)
Expand All @@ -1168,11 +1169,12 @@ def test_singlesketch_simple(runtmp):
assert sig.name == "short.fa"
assert sig.minhash.ksize == 31
assert sig.minhash.is_dna
assert sig.minhash.scaled == 1000
assert sig.minhash.scaled == 10
print("HASHES", sig.minhash.hashes)

# validate against sourmash sketch
output2 = runtmp.output("short2.sig")
runtmp.sourmash("sketch", "dna", fa1, "-o", output2)
runtmp.sourmash("sketch", "dna", fa1, "-o", output2, "-p", "scaled=10")
sig2 = sourmash.load_one_signature(output2)
assert sig.minhash.hashes == sig2.minhash.hashes

Expand Down Expand Up @@ -1301,6 +1303,7 @@ def test_singlesketch_protein_moltype(runtmp):
assert sig.minhash.ksize == 10
assert sig.minhash.is_protein
assert sig.minhash.scaled == 100
print("HASHES:", sig.minhash.hashes)

# validate against sourmash sketch
output2 = runtmp.output("short2.sig")
Expand Down Expand Up @@ -1381,7 +1384,7 @@ def test_singlesketch_gzipped_output(runtmp):
output = runtmp.output("short.sig.gz")

# Run the singlesketch command
runtmp.sourmash("scripts", "singlesketch", fa1, "-o", output)
runtmp.sourmash("scripts", "singlesketch", fa1, "-o", output, "-p", "scaled=10")

# Check if the output exists and contains the expected data
assert os.path.exists(output)
Expand All @@ -1397,15 +1400,16 @@ def test_singlesketch_gzipped_output(runtmp):

# check the signatures
sig = sourmash.load_one_signature(output)
print("HASHES:", sig.minhash.hashes)

assert sig.name == "short.fa"
assert sig.minhash.ksize == 31
assert sig.minhash.is_dna
assert sig.minhash.scaled == 1000
assert sig.minhash.scaled == 10

# validate against sourmash sketch
output2 = runtmp.output("short2.sig")
runtmp.sourmash("sketch", "dna", fa1, "-o", output2)
runtmp.sourmash("sketch", "dna", fa1, "-o", output2, "-p", "scaled=10")
sig2 = sourmash.load_one_signature(output2)
assert sig.minhash.hashes == sig2.minhash.hashes

Expand All @@ -1416,7 +1420,7 @@ def test_singlesketch_zip_output(runtmp):
output = runtmp.output("short.zip")

# Run the singlesketch command
runtmp.sourmash("scripts", "singlesketch", fa1, "-o", output)
runtmp.sourmash("scripts", "singlesketch", fa1, "-o", output, "-p", "scaled=10")

# Check if the output exists and contains the expected data
assert os.path.exists(output)
Expand All @@ -1425,15 +1429,16 @@ def test_singlesketch_zip_output(runtmp):
assert len(sigs) == 1
print(sigs)
sig = sigs[0]
print("HASHES:", sig.minhash.hashes)

assert sig.name == "short.fa"
assert sig.minhash.ksize == 31
assert sig.minhash.is_dna
assert sig.minhash.scaled == 1000
assert sig.minhash.scaled == 10

# validate against sourmash sketch
output2 = runtmp.output("short2.sig")
runtmp.sourmash("sketch", "dna", fa1, "-o", output2)
runtmp.sourmash("sketch", "dna", fa1, "-o", output2, "-p", "scaled=10")
sig2 = sourmash.load_one_signature(output2)
assert sig.minhash.hashes == sig2.minhash.hashes

Expand Down Expand Up @@ -1614,3 +1619,32 @@ def test_singlesketch_skipm2n3(runtmp):
assert (
data[0]["name"] == expected["name"]
), f"Unexpected name: {data[0]['name']}"


def test_singlesketch_stdin(runtmp):
    """Test single sketching when the FASTA input is supplied on stdin ("-")."""
    fa1 = get_test_data("short.fa")
    output = runtmp.output("short.sig")

    # Feed the FASTA bytes to the command over a stdin pipe. Using an
    # argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact and avoids depending on an external `cat`.
    with open(fa1, "rb") as fp:
        fasta_bytes = fp.read()
    cmd = [
        "sourmash",
        "scripts",
        "singlesketch",
        "-",
        "--name",
        "short",
        "-o",
        output,
        "-p",
        "dna,scaled=10",
    ]
    result = subprocess.run(cmd, input=fasta_bytes, capture_output=True)

    # Check if the command succeeded
    stderr_text = result.stderr.decode(errors="replace")
    assert result.returncode == 0, f"Command failed: {stderr_text}"

    # Check if the output exists and contains the expected data
    assert os.path.exists(output)
    sig = sourmash.load_one_signature(output)

    # The name comes from --name, since stdin input has no filename
    assert sig.name == "short"
    assert sig.minhash.ksize == 31
    assert sig.minhash.is_dna
    assert sig.minhash.scaled == 10
    print("HASHES:", sig.minhash.hashes)

    # validate against sourmash sketch
    output2 = runtmp.output("short2.sig")
    runtmp.sourmash("sketch", "dna", fa1, "-o", output2, "-p", "dna,scaled=10")
    sig2 = sourmash.load_one_signature(output2)
    assert sig.minhash.hashes == sig2.minhash.hashes
5 changes: 2 additions & 3 deletions src/utils/buildutils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use anyhow::{anyhow, Context, Result};
use camino::Utf8PathBuf;
use getset::{Getters, Setters};
use needletail::parser::SequenceRecord;
use needletail::{parse_fastx_file, parse_fastx_reader};
use needletail::{parse_fastx_file, parse_fastx_reader, parse_fastx_stdin};
use serde::Serialize;
use sourmash::cmd::ComputeParameters;
use sourmash::encodings::{HashFunctions, Idx};
Expand Down Expand Up @@ -835,8 +835,7 @@ impl BuildCollection {
) -> Result<u64> {
// Create a FASTX reader from the file or stdin
let mut fastx_reader = if filename == "-" {
let stdin = std::io::stdin();
parse_fastx_reader(stdin).context("Failed to parse FASTA/FASTQ data from stdin")?
parse_fastx_stdin().context("Failed to parse FASTA/FASTQ data from stdin")?
} else {
parse_fastx_file(&filename).context("Failed to open file for FASTA/FASTQ data")?
};
Expand Down

0 comments on commit fc9c124

Please sign in to comment.