Skip to content

Commit

Permalink
Merge branch 'main' into dependabot/cargo/tempfile-3.15.0
Browse files Browse the repository at this point in the history
  • Loading branch information
ctb authored Jan 6, 2025
2 parents 4b58ec8 + 7feb3d5 commit 6f6e1b2
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 17 deletions.
60 changes: 47 additions & 13 deletions src/python/tests/test_sketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import csv
import pandas
import sourmash
import subprocess
from sourmash import index
import io
from . import sourmash_tst_utils as utils
Expand Down Expand Up @@ -572,10 +573,10 @@ def test_zip_manifest(runtmp, capfd):
assert len(manifest) == len(rows)
assert len(manifest) == 3

md5_list = [row["md5"] for row in manifest.rows]
assert "9191284a3a23a913d8d410f3d53ce8f0" in md5_list
assert "d663bb55b2a0f8782c53c8af89f20fff" in md5_list
assert "bf752903d635b1eb83c53fe4aae951db" in md5_list
md5_nhashes = [(row["md5"], row["n_hashes"]) for row in manifest.rows]
assert ("9191284a3a23a913d8d410f3d53ce8f0", 970) in md5_nhashes
assert ("d663bb55b2a0f8782c53c8af89f20fff", 925) in md5_nhashes
assert ("bf752903d635b1eb83c53fe4aae951db", 955) in md5_nhashes

for sig in siglist:
assert sig in manifest
Expand Down Expand Up @@ -1159,7 +1160,7 @@ def test_singlesketch_simple(runtmp):
output = runtmp.output("short.sig")

# Run the singlesketch command
runtmp.sourmash("scripts", "singlesketch", fa1, "-o", output)
runtmp.sourmash("scripts", "singlesketch", fa1, "-o", output, "-p", "scaled=10")

# Check if the output exists and contains the expected data
assert os.path.exists(output)
Expand All @@ -1168,11 +1169,12 @@ def test_singlesketch_simple(runtmp):
assert sig.name == "short.fa"
assert sig.minhash.ksize == 31
assert sig.minhash.is_dna
assert sig.minhash.scaled == 1000
assert sig.minhash.scaled == 10
print("HASHES", sig.minhash.hashes)

# validate against sourmash sketch
output2 = runtmp.output("short2.sig")
runtmp.sourmash("sketch", "dna", fa1, "-o", output2)
runtmp.sourmash("sketch", "dna", fa1, "-o", output2, "-p", "scaled=10")
sig2 = sourmash.load_one_signature(output2)
assert sig.minhash.hashes == sig2.minhash.hashes

Expand Down Expand Up @@ -1301,6 +1303,7 @@ def test_singlesketch_protein_moltype(runtmp):
assert sig.minhash.ksize == 10
assert sig.minhash.is_protein
assert sig.minhash.scaled == 100
print("HASHES:", sig.minhash.hashes)

# validate against sourmash sketch
output2 = runtmp.output("short2.sig")
Expand Down Expand Up @@ -1381,7 +1384,7 @@ def test_singlesketch_gzipped_output(runtmp):
output = runtmp.output("short.sig.gz")

# Run the singlesketch command
runtmp.sourmash("scripts", "singlesketch", fa1, "-o", output)
runtmp.sourmash("scripts", "singlesketch", fa1, "-o", output, "-p", "scaled=10")

# Check if the output exists and contains the expected data
assert os.path.exists(output)
Expand All @@ -1397,15 +1400,16 @@ def test_singlesketch_gzipped_output(runtmp):

# check the signatures
sig = sourmash.load_one_signature(output)
print("HASHES:", sig.minhash.hashes)

assert sig.name == "short.fa"
assert sig.minhash.ksize == 31
assert sig.minhash.is_dna
assert sig.minhash.scaled == 1000
assert sig.minhash.scaled == 10

# validate against sourmash sketch
output2 = runtmp.output("short2.sig")
runtmp.sourmash("sketch", "dna", fa1, "-o", output2)
runtmp.sourmash("sketch", "dna", fa1, "-o", output2, "-p", "scaled=10")
sig2 = sourmash.load_one_signature(output2)
assert sig.minhash.hashes == sig2.minhash.hashes

Expand All @@ -1416,7 +1420,7 @@ def test_singlesketch_zip_output(runtmp):
output = runtmp.output("short.zip")

# Run the singlesketch command
runtmp.sourmash("scripts", "singlesketch", fa1, "-o", output)
runtmp.sourmash("scripts", "singlesketch", fa1, "-o", output, "-p", "scaled=10")

# Check if the output exists and contains the expected data
assert os.path.exists(output)
Expand All @@ -1425,15 +1429,16 @@ def test_singlesketch_zip_output(runtmp):
assert len(sigs) == 1
print(sigs)
sig = sigs[0]
print("HASHES:", sig.minhash.hashes)

assert sig.name == "short.fa"
assert sig.minhash.ksize == 31
assert sig.minhash.is_dna
assert sig.minhash.scaled == 1000
assert sig.minhash.scaled == 10

# validate against sourmash sketch
output2 = runtmp.output("short2.sig")
runtmp.sourmash("sketch", "dna", fa1, "-o", output2)
runtmp.sourmash("sketch", "dna", fa1, "-o", output2, "-p", "scaled=10")
sig2 = sourmash.load_one_signature(output2)
assert sig.minhash.hashes == sig2.minhash.hashes

Expand Down Expand Up @@ -1614,3 +1619,32 @@ def test_singlesketch_skipm2n3(runtmp):
assert (
data[0]["name"] == expected["name"]
), f"Unexpected name: {data[0]['name']}"


def test_singlesketch_stdin(runtmp):
    """Test that singlesketch sketches sequence data read from stdin ('-').

    The FASTA file is streamed to the command's standard input, and the
    resulting signature is compared against one built by `sourmash sketch dna`
    from the same file with the same parameters.
    """
    fa1 = get_test_data("short.fa")
    output = runtmp.output("short.sig")

    # Run the singlesketch command with the FASTA file attached to its stdin.
    # An argument list (no shell) is used so that paths containing spaces or
    # shell metacharacters are passed through untouched, and so the test does
    # not depend on `cat` or a POSIX shell being available.
    cmd = [
        "sourmash",
        "scripts",
        "singlesketch",
        "-",
        "--name",
        "short",
        "-o",
        output,
        "-p",
        "dna,scaled=10",
    ]
    with open(fa1, "rb") as fasta_stream:
        result = subprocess.run(
            cmd, stdin=fasta_stream, capture_output=True, text=True
        )

    # Check if the command succeeded
    assert result.returncode == 0, f"Command failed: {result.stderr}"

    # Check if the output exists and contains the expected data
    assert os.path.exists(output)
    sig = sourmash.load_one_signature(output)

    assert sig.name == "short"
    assert sig.minhash.ksize == 31
    assert sig.minhash.is_dna
    assert sig.minhash.scaled == 10
    print("HASHES:", sig.minhash.hashes)

    # validate against sourmash sketch
    output2 = runtmp.output("short2.sig")
    runtmp.sourmash("sketch", "dna", fa1, "-o", output2, "-p", "dna,scaled=10")
    sig2 = sourmash.load_one_signature(output2)
    assert sig.minhash.hashes == sig2.minhash.hashes
8 changes: 4 additions & 4 deletions src/utils/buildutils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@ use anyhow::{anyhow, Context, Result};
use camino::Utf8PathBuf;
use getset::{Getters, Setters};
use needletail::parser::SequenceRecord;
use needletail::{parse_fastx_file, parse_fastx_reader};
use needletail::{parse_fastx_file, parse_fastx_reader, parse_fastx_stdin};
use serde::Serialize;
use sourmash::cmd::ComputeParameters;
use sourmash::encodings::{HashFunctions, Idx};
use sourmash::errors::SourmashError;
use sourmash::manifest::Record;
use sourmash::selection::Selection;
use sourmash::signature::Signature;
use sourmash::signature::SigsTrait;
use std::collections::HashMap;
use std::collections::HashSet;
use std::fmt::Display;
Expand Down Expand Up @@ -835,8 +836,7 @@ impl BuildCollection {
) -> Result<u64> {
// Create a FASTX reader from the file or stdin
let mut fastx_reader = if filename == "-" {
let stdin = std::io::stdin();
parse_fastx_reader(stdin).context("Failed to parse FASTA/FASTQ data from stdin")?
parse_fastx_stdin().context("Failed to parse FASTA/FASTQ data from stdin")?
} else {
parse_fastx_file(&filename).context("Failed to open file for FASTA/FASTQ data")?
};
Expand Down Expand Up @@ -889,7 +889,7 @@ impl BuildCollection {
record.set_filename(Some(filename.clone()));
record.set_md5(Some(sig.md5sum()));
record.set_md5short(Some(sig.md5sum()[0..8].into()));
record.set_n_hashes(Some(sig.size()));
record.set_n_hashes(Some(sig.minhash().map(|mh| mh.size()).unwrap_or(0)));

// note, this needs to be set when writing sigs (not here)
// record.set_internal_location("")
Expand Down

0 comments on commit 6f6e1b2

Please sign in to comment.