Skip to content

Commit

Permalink
[MRG] add code to support skipping genomes by ident (#255)
Browse files Browse the repository at this point in the history
* add code to support skipping genomes by ident

* add test

* add 'skip_genomes' docs and tests

* fix a few things
  • Loading branch information
ctb authored Dec 6, 2022
1 parent ec4ab0d commit fc3dbeb
Show file tree
Hide file tree
Showing 7 changed files with 115 additions and 6 deletions.
10 changes: 10 additions & 0 deletions doc/configuring.md
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,16 @@ prevent_sra_download: false
# picklist: some_sig_list.csv:ident:ident
picklist: ""

# skip_genomes: identifiers to ignore when they show up in gather output.
# This is useful when the sourmash database contains genomes that are no
# longer present in GenBank because they have been deprecated or suppressed.
#
# Note, in such cases you should try to find a new genome to include in
# a local database!
#
# DEFAULT: []
skip_genomes: []

# sourmash_database_threshold_bp: sets the --threshold-bp minimum match
# size of sourmash prefetch and gather. Matches with smaller overlaps
# than this will be excluded from consideration.
Expand Down
22 changes: 16 additions & 6 deletions genome_grist/conf/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,8 @@ def make_param_str(ksizes, scaled):
ks = ",".join(ks)
return f"{ks},scaled={scaled},abund"

IGNORE_IDENTS = config.get('skip_genomes', [])

###
#
# print out unknown config variables...
Expand All @@ -113,6 +115,7 @@ known_config_keys = { 'samples', 'outdir', 'tempdir',
'prefetch_memory',
'taxonomies',
'prevent_sra_download',
'skip_genomes',
'picklist' }

unknown_config_keys = all_config_keys - known_config_keys
Expand Down Expand Up @@ -262,6 +265,8 @@ class Checkpoint_GatherResults:
genome_idents = []
for row in load_gather_csv(gather_csv):
ident = row['name'].split()[0]
if ident in IGNORE_IDENTS:
continue
genome_idents.append(ident)

return genome_idents
Expand Down Expand Up @@ -1047,7 +1052,8 @@ rule download_matching_genome_wc:
input:
csvfile = ancient(f'{GENBANK_CACHE}/{{ident}}.info.csv')
output:
genome = f"{GENBANK_CACHE}/{{ident}}_genomic.fna.gz"
genome = f"{GENBANK_CACHE}/{{ident}}_genomic.fna.gz",
retries: 2
run:
rows = list(load_csv(input.csvfile))
assert len(rows) == 1
Expand All @@ -1060,11 +1066,15 @@ rule download_matching_genome_wc:
print(f"downloading genome for ident {ident}/{name} from NCBI...",
file=sys.stderr)
with open(output.genome, 'wb') as outfp:
with urllib.request.urlopen(url) as response:
content = response.read()
outfp.write(content)
print(f"...wrote {len(content)} bytes to {output.genome}",
file=sys.stderr)
try:
with urllib.request.urlopen(url) as response:
content = response.read()
outfp.write(content)
print(f"...wrote {len(content)} bytes to {output.genome}",
file=sys.stderr)
except urllib.request.HTTPError:
print(f"Cannot download genome from URL:\n {url}\nIs it missing? If so, consider adding '{ident}' to 'skip_genomes' list in config file.", file=sys.stderr)
raise Exception("Genbank genome not found")

# summarize_reads_info
rule summarize_reads_info_wc:
Expand Down
Binary file not shown.
Binary file added tests/test-data/GCF_000020205.1.sig.zip
Binary file not shown.
10 changes: 10 additions & 0 deletions tests/test-data/conf-missing-skip.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
samples:
- GCF_000020205-is-missing
outdir: outputs.missing
sourmash_databases:
- tests/test-data/GCF_000020205.1.sig.zip

prevent_sra_download: true

skip_genomes:
- GCF_000020205.1
9 changes: 9 additions & 0 deletions tests/test-data/conf-missing.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
samples:
- GCF_000020205-is-missing
outdir: outputs.missing
sourmash_databases:
- tests/test-data/GCF_000020205.1.sig.zip

prevent_sra_download: true

skip_genomes: []
70 changes: 70 additions & 0 deletions tests/test_snakemake_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,3 +87,73 @@ def test_gather_reads_nomatches():

assert "DB is tests/test-data/SRR5950647-genomes/acidulo.zip" in pinfo.stdout
assert "** ERROR: prefetch didn't find anything for sample 'SRR5950647_subset'." in pinfo.stdout


def test_missing_genbank_genome_fail():
# run download_genbank_genomes for a genome that is no longer there; fail.
global _tempdir

conf = utils.relative_file('tests/test-data/conf-missing.yml')
extra_args = ["download_genbank_genomes"]

sigs_dir = os.path.join(_tempdir, "sigs")
try:
os.mkdir(sigs_dir)
except FileExistsError:
pass

src = utils.relative_file("tests/test-data/GCF_000020205-is-missing.trim.sig.zip")
shutil.copy(src, sigs_dir)

pinfo = run_snakemake(
conf,
no_use_conda=True,
verbose=True,
outdir=_tempdir,
extra_args=extra_args,
subprocess_args=dict(text=True, capture_output=True)
)
print('STDOUT')
print(pinfo.stdout)
print('STDERR')
print(pinfo.stderr)

assert "Cannot download genome from URL:" in pinfo.stderr
assert "Is it missing? If so, consider adding 'GCF_000020205.1' to 'skip_genomes' list in config file." in pinfo.stderr

assert pinfo.returncode != 0


def test_missing_genbank_genome_skip():
# run download_genbank_genomes for a genome that is no longer there; skip.
global _tempdir

conf = utils.relative_file('tests/test-data/conf-missing-skip.yml')
extra_args = ["download_genbank_genomes"]

sigs_dir = os.path.join(_tempdir, "sigs")
try:
os.mkdir(sigs_dir)
except FileExistsError:
pass

src = utils.relative_file("tests/test-data/GCF_000020205-is-missing.trim.sig.zip")
shutil.copy(src, sigs_dir)

pinfo = run_snakemake(
conf,
no_use_conda=True,
verbose=True,
outdir=_tempdir,
extra_args=extra_args,
subprocess_args=dict(text=True, capture_output=True)
)
print('STDOUT')
print(pinfo.stdout)
print('STDERR')
print(pinfo.stderr)

assert "Cannot download genome from URL:" not in pinfo.stderr
assert "Is it missing? If so, consider adding 'GCF_000020205.1' to 'skip_genomes' list in config file." not in pinfo.stderr

assert pinfo.returncode == 0

0 comments on commit fc3dbeb

Please sign in to comment.