src/sourmash_plugin_pangenomics.py (46 changes: 40 additions, 6 deletions)
@@ -14,6 +14,7 @@
 import re
 import pprint
 from difflib import get_close_matches
+import gzip
 
 import sourmash
 import sourmash_utils
@@ -156,6 +157,12 @@ def __init__(self, subparser):
             required=True,
             help="CSV file containing classification of each hash",
         )
+        p.add_argument(
+            "--gzip",
+            action="store_true",
+            help="Maximally compress the hash classification CSV with gzip"
+        )
+
         sourmash_utils.add_standard_minhash_args(p)
 
     def main(self, args):
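As a quick aside, `action="store_true"` makes `--gzip` a boolean flag that defaults to False. A minimal standalone sketch of that behavior (the parser construction here is illustrative, not the plugin's actual setup):

```python
import argparse

# Hypothetical minimal parser mirroring just the new flag.
p = argparse.ArgumentParser()
p.add_argument("--gzip", action="store_true",
               help="Maximally compress the hash classification CSV with gzip")

assert p.parse_args(["--gzip"]).gzip is True   # flag present -> True
assert p.parse_args([]).gzip is False          # flag absent  -> False
```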
@@ -328,7 +335,11 @@ def pangenome_createdb_main(args):
 
 # Chunk function to limit the memory used by the hash_count dict and list
 def write_chunk(chunk, output_file):
-    with open(output_file, "a", newline="") as csvfile:
+    if output_file.endswith('.gz'):
Review comment (Member), on the line above: this looks good - not sure you need the --gzip option after all? It doesn't seem to add much :).
+        csvfile = gzip.open(output_file, "at", newline="", encoding="utf-8")
+    else:
+        csvfile = open(output_file, "a", newline="", encoding="utf-8")
+    with csvfile:
         fieldnames = ["lineage", "sig_name", "hash_count", "genome_count"]
         writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
         writer.writerows(chunk)
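The pattern above (pick a file object by extension, then manage it with a single `with` block) can also be factored into a helper. A minimal, self-contained sketch under the assumptions of this PR; `open_append_csv`, the file name, and the example chunk are illustrative, not part of the change:

```python
import csv
import gzip

def open_append_csv(path):
    """Open a CSV for text-mode append; use gzip when the name ends in .gz."""
    if path.endswith(".gz"):
        # "at" appends text; gzip readers handle the resulting
        # multi-member stream transparently.
        return gzip.open(path, "at", newline="", encoding="utf-8")
    return open(path, "a", newline="", encoding="utf-8")

# Example chunk, matching the fieldnames used in write_chunk().
chunk = [{"lineage": "g__Escherichia", "sig_name": "sig1",
          "hash_count": 42, "genome_count": 7}]

with open_append_csv("hash_counts.csv.gz") as csvfile:
    fieldnames = ["lineage", "sig_name", "hash_count", "genome_count"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writerows(chunk)
```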
@@ -552,13 +563,29 @@ def pangenome_ranktable_main(args):
     print(
         f"Writing hash classification to CSV file '{args.output_hash_classification}'"
     )
-    with open(args.output_hash_classification, "w", newline="") as fp:
-        w = csv.writer(fp)
-        w.writerow(["hashval", "freq", "abund", "max_abund"])
+    output_file = args.output_hash_classification
 
-        for hashval, freq, hash_abund, max_value in frequencies:
-            w.writerow([hashval, freq, hash_abund, max_value])
+    use_gzip = (
+        getattr(args, "gzip", False)
+        or output_file.endswith(".gz")
+    )
+
+    if use_gzip and not output_file.endswith(".gz"):
+        output_file += ".gz"
+
+    if use_gzip:
+        with gzip.open(output_file, "wt", newline="", compresslevel=9) as fp:
+            w = csv.writer(fp)
+            w.writerow(["hashval", "freq", "abund", "max_abund"])
+            for hashval, freq, hash_abund, max_value in frequencies:
+                w.writerow([hashval, freq, hash_abund, max_value])
+    else:
+        with open(args.output_hash_classification, "w", newline="") as fp:
+            w = csv.writer(fp)
+            w.writerow(["hashval", "freq", "abund", "max_abund"])
+
+            for hashval, freq, hash_abund, max_value in frequencies:
+                w.writerow([hashval, freq, hash_abund, max_value])
 
 #
 # pangenome_classify
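The output-name logic above compresses when either the flag is set or the name already ends in `.gz`, and appends `.gz` only when the flag forced compression onto a plain name. A standalone sketch of just that decision (the helper name is illustrative, and `gzip_flag` stands in for `args.gzip`):

```python
def resolve_output(name, gzip_flag):
    """Mirror the use_gzip / rename logic from pangenome_ranktable_main."""
    use_gzip = gzip_flag or name.endswith(".gz")
    if use_gzip and not name.endswith(".gz"):
        name += ".gz"
    return name, use_gzip

assert resolve_output("hashes.csv", False) == ("hashes.csv", False)
assert resolve_output("hashes.csv", True) == ("hashes.csv.gz", True)
assert resolve_output("hashes.csv.gz", False) == ("hashes.csv.gz", True)
```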
@@ -595,6 +622,13 @@ def classify_hashes_main(args):
     shell_mh = minhash.copy_and_clear()
 
     # load in all the frequencies etc, and classify
-    for csv_file in args.ranktable_csv_files:
-        with open(csv_file, "r", newline="") as fp:
-            r = csv.DictReader(fp)
+    for csv_file in args.ranktable_csv_files:
+        if csv_file.endswith('.gz'):
+            fp = gzip.open(csv_file, "rt", newline="", encoding="utf-8")
+        else:
+            fp = open(csv_file, "r", newline="", encoding="utf-8")
+        with fp:
+            r = csv.DictReader(fp)
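On the reading side, `gzip.open(..., "rt")` yields decoded text lines, so `csv.DictReader` works identically on plain and compressed ranktables. A minimal sketch of that round trip (the file name is hypothetical; `hashval` and `freq` are columns written by the ranktable code above):

```python
import csv
import gzip

def open_ranktable(path):
    """Open a ranktable CSV for reading, transparently handling .gz."""
    if path.endswith(".gz"):
        return gzip.open(path, "rt", newline="", encoding="utf-8")
    return open(path, "r", newline="", encoding="utf-8")

with open_ranktable("ranktable.csv.gz") as fp:
    for row in csv.DictReader(fp):
        print(row["hashval"], row["freq"])
```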