From e13c60a43298e27bc9fe457a8bfd503c9a1f3c50 Mon Sep 17 00:00:00 2001
From: EC2 Default User
Date: Wed, 29 Nov 2023 17:14:48 +0000
Subject: [PATCH 1/4] Created a GitHub workflow that checks files with black
 upon pushing them to the repo

---
 .github/workflows/presubmit.yml | 12 ++++++++++++
 check.sh                        | 27 +++++++++++++++++++++++++++
 requirements-dev.txt            |  1 +
 3 files changed, 40 insertions(+)
 create mode 100644 .github/workflows/presubmit.yml
 create mode 100644 check.sh
 create mode 100644 requirements-dev.txt

diff --git a/.github/workflows/presubmit.yml b/.github/workflows/presubmit.yml
new file mode 100644
index 00000000..4f88a650
--- /dev/null
+++ b/.github/workflows/presubmit.yml
@@ -0,0 +1,12 @@
+name: Presubmit
+on: [push]
+jobs:
+  Presubmit:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.11'
+      - uses: actions/checkout@v3
+      - run: python3 -m pip install -r requirements-dev.txt
+      - run: ./check.sh
diff --git a/check.sh b/check.sh
new file mode 100644
index 00000000..2b9962a5
--- /dev/null
+++ b/check.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+
+CHANGED=0
+if [[ $# = 1 ]] && [[ "$1" = "--fix" ]]; then
+    if ! black --check . &> /dev/null; then
+        black . || exit 1
+        CHANGED=1
+    fi
+
+    if ! isort --check . &> /dev/null; then
+        isort . || exit 1
+        CHANGED=1
+    fi
+elif [[ $# -gt 0 ]]; then
+    echo "Usage: $0 [--fix]" > /dev/stderr
+    exit 1
+fi
+
+# if this fails with "black: command not found" you need to install black
+#   python3 -m pip install black
+echo Running black to check formatting...
+if ! black --check --diff --quiet .; then
+    echo "FAIL: formatting"
+    exit 1
+fi
+
+
diff --git a/requirements-dev.txt b/requirements-dev.txt
new file mode 100644
index 00000000..7e66a17d
--- /dev/null
+++ b/requirements-dev.txt
@@ -0,0 +1 @@
+black
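The workflow above installs requirements-dev.txt and then runs ./check.sh on
every push. For running the same gate by hand, a minimal Python sketch of what
the black step does (illustrative only, not part of this patch series; assumes
black is installed, e.g. via python3 -m pip install -r requirements-dev.txt):

    #!/usr/bin/env python3
    # check_format.py -- illustrative sketch; mirrors the black step of
    # check.sh: exit 0 if formatting is clean, 1 otherwise.
    import subprocess
    import sys

    def main() -> int:
        # black --check exits 0 when nothing would change, 1 when files
        # would be reformatted.
        result = subprocess.run(["black", "--check", "--diff", "--quiet", "."])
        if result.returncode != 0:
            print("FAIL: formatting", file=sys.stderr)
            return 1
        return 0

    if __name__ == "__main__":
        sys.exit(main())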
From 36638f57a86d1892dd4bb627bfdf85003cee3028 Mon Sep 17 00:00:00 2001
From: EC2 Default User
Date: Wed, 29 Nov 2023 17:25:57 +0000
Subject: [PATCH 2/4] Made check.sh executable.

---
 check.sh | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 check.sh

diff --git a/check.sh b/check.sh
old mode 100644
new mode 100755

From 38a8de7d429cdcf876880361cc00f5f743ed74c2 Mon Sep 17 00:00:00 2001
From: EC2 Default User
Date: Wed, 29 Nov 2023 20:54:07 +0000
Subject: [PATCH 3/4] Added black settings to always run with line length 79

---
 pyproject.toml | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 pyproject.toml

diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..d84cc51b
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,6 @@
+[tool.black]
+line-length = 79
+
+[tool.isort]
+profile = "black"
+line_length = 79
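The config above pins black and isort to 79 columns (down from black's default
of 88) and selects isort's black-compatible profile so the two tools don't
fight over import layout. Patch 4 below is the mechanical fallout: rerunning
black splits every call that fit at 88 columns but not at 79. A small runnable
sketch of the rewrap pattern (hypothetical values; the wrapped form matches
the PRJEB13833 hunk below):

    # rewrap_demo.py -- illustrative only; values are hypothetical.
    run_accession, yyyy, mm, dd, site = "RUN1", "2015", "06", "30", "4"

    # At 88 columns black kept calls like this on one line inside the
    # original scripts; at 79 (with the scripts' real indentation) the
    # line is too long, so black breaks the expression apart:
    line = (
        "%s\t%s-%s-%s\tCluster %s\n"
        % (run_accession, yyyy, mm, dd, site)
    )
    print(line, end="")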
From 96721a63f74209e152e88d963c7dbe596dead777 Mon Sep 17 00:00:00 2001
From: EC2 Default User
Date: Wed, 29 Nov 2023 20:57:02 +0000
Subject: [PATCH 4/4] Reran black using line length 79

---
 .../PRJEB13833/metadata/prepare-metadata.py   |  12 +-
 .../PRJEB49260/metadata/prepare_metadata.py   |   4 +-
 .../PRJNA729801/metadata/parse_metadata.py    |   4 +-
 .../PRJNA812772/metadata/prepare_metadata.py  |  12 +-
 build_bowtie2_db.py                           |   8 +-
 count_clades.py                               |   4 +-
 dashboard/determine_comparison_species.py     |   4 +-
 dashboard/determine_key_clades.py             |   4 +-
 dashboard/prepare-dashboard-data.py           |  37 +++--
 dashboard/sample_metadata_classifier.py       |  26 +++-
 expand-human-viruses.py                       |   8 +-
 papers/Brinch2020/prepare_metadata.py         |   8 +-
 papers/Munk2022/prepare-metadata.py           |   4 +-
 pipeline-operation/screen-summary.py          |   4 +-
 reprocess-bioprojects.py                      |   7 +-
 run.py                                        | 136 +++++++++++++-----
 16 files changed, 211 insertions(+), 71 deletions(-)

diff --git a/bioprojects/PRJEB13833/metadata/prepare-metadata.py b/bioprojects/PRJEB13833/metadata/prepare-metadata.py
index 57e8b3e8..aa04a769 100755
--- a/bioprojects/PRJEB13833/metadata/prepare-metadata.py
+++ b/bioprojects/PRJEB13833/metadata/prepare-metadata.py
@@ -6,12 +6,16 @@
         if line.startswith("run"):
             continue
 
-        run_accession, sample_accession, sample_alias, sample_title = line.split(
-            "\t"
-        )
+        (
+            run_accession,
+            sample_accession,
+            sample_alias,
+            sample_title,
+        ) = line.split("\t")
         _, yyyy, mm, dd, site = sample_title.split("_")
 
         outf.write(
-            "%s\t%s-%s-%s\tCluster %s\n" % (run_accession, yyyy, mm, dd, site)
+            "%s\t%s-%s-%s\tCluster %s\n"
+            % (run_accession, yyyy, mm, dd, site)
         )
diff --git a/bioprojects/PRJEB49260/metadata/prepare_metadata.py b/bioprojects/PRJEB49260/metadata/prepare_metadata.py
index f222cce8..f2b5d5b7 100755
--- a/bioprojects/PRJEB49260/metadata/prepare_metadata.py
+++ b/bioprojects/PRJEB49260/metadata/prepare_metadata.py
@@ -26,4 +26,6 @@
         dd = date[2:4]
         yy = date[4:6]
 
-        outf.write("%s\t20%s-%s-%s\t%s\n" % (run_accession, yy, mm, dd, location))
+        outf.write(
+            "%s\t20%s-%s-%s\t%s\n" % (run_accession, yy, mm, dd, location)
+        )
diff --git a/bioprojects/PRJNA729801/metadata/parse_metadata.py b/bioprojects/PRJNA729801/metadata/parse_metadata.py
index 86f177f9..f24a7c9f 100755
--- a/bioprojects/PRJNA729801/metadata/parse_metadata.py
+++ b/bioprojects/PRJNA729801/metadata/parse_metadata.py
@@ -39,7 +39,9 @@ def start(raw_metadata_in, parsed_metadata_out):
     data.sort()
 
     with open(parsed_metadata_out, "w") as outf:
-        outf.write("\t".join(["filename", "date", "plant", "is_enriched"]) + "\n")
+        outf.write(
+            "\t".join(["filename", "date", "plant", "is_enriched"]) + "\n"
+        )
         for plant, date, filename, is_enriched in data:
             outf.write("\t".join([filename, date, plant, is_enriched]) + "\n")
 
diff --git a/bioprojects/PRJNA812772/metadata/prepare_metadata.py b/bioprojects/PRJNA812772/metadata/prepare_metadata.py
index d1ccbe43..78e4d64a 100755
--- a/bioprojects/PRJNA812772/metadata/prepare_metadata.py
+++ b/bioprojects/PRJNA812772/metadata/prepare_metadata.py
@@ -16,11 +16,17 @@
 with open("raw_metadata.tsv") as inf:
     with open("metadata.tsv", "w") as outf:
         for line in inf:
-            sample_accession, run_accession, sample_alias = line.strip().split("\t")
+            sample_accession, run_accession, sample_alias = line.strip().split(
+                "\t"
+            )
             _, strategy = sample_alias.split("_")
             if strategy == "sarscov2":
                 continue
 
-            collection_date = sample_accession_to_collection_date[sample_accession]
-            outf.write("%s\t%s\t%s\n" % (run_accession, strategy, collection_date))
+            collection_date = sample_accession_to_collection_date[
+                sample_accession
+            ]
+            outf.write(
+                "%s\t%s\t%s\n" % (run_accession, strategy, collection_date)
+            )
diff --git a/build_bowtie2_db.py b/build_bowtie2_db.py
index 1eb11abd..22bf9f3b 100755
--- a/build_bowtie2_db.py
+++ b/build_bowtie2_db.py
@@ -116,7 +116,9 @@ def combine_genomes(combined_genomes_fname):
                 outf.writelines(inf.readlines())
 
 
-def mask_low_complexity_sequences(combined_genomes_fname, masked_genomes_fname):
+def mask_low_complexity_sequences(
+    combined_genomes_fname, masked_genomes_fname
+):
     if os.path.exists(masked_genomes_fname):
         return
     print("Masking low complexity sequences...")
@@ -149,7 +151,9 @@ def mask_low_complexity_sequences(combined_genomes_fname, masked_genomes_fname):
     #
     # This regexp replaces all lowercase letters that aren't on lines beginning
     # with '>', which in FASTA means everywhere except in the sequence IDs.
-    subprocess.check_call(["sed", "/^>/!s/[a-z]/x/g", "-i", masked_genomes_fname])
+    subprocess.check_call(
+        ["sed", "/^>/!s/[a-z]/x/g", "-i", masked_genomes_fname]
+    )
 
 
 def build_db(bowtie_db_prefix, genomes_fname):
diff --git a/count_clades.py b/count_clades.py
index 16046380..94c5b4a5 100755
--- a/count_clades.py
+++ b/count_clades.py
@@ -10,7 +10,9 @@
 parents = {}  # child_taxid -> parent_taxid
 with open("dashboard/nodes.dmp") as inf:
     for line in inf:
-        child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split("\t|\t")
+        child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split(
+            "\t|\t"
+        )
         child_taxid = int(child_taxid)
         parent_taxid = int(parent_taxid)
         parents[child_taxid] = parent_taxid
diff --git a/dashboard/determine_comparison_species.py b/dashboard/determine_comparison_species.py
index 953a717e..85959c60 100755
--- a/dashboard/determine_comparison_species.py
+++ b/dashboard/determine_comparison_species.py
@@ -8,7 +8,9 @@
 children = defaultdict(list)  # parent_taxid -> [children]
 with open("nodes.dmp") as inf:
     for line in inf:
-        child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split("\t|\t")
+        child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split(
+            "\t|\t"
+        )
         child_taxid = int(child_taxid)
         parent_taxid = int(parent_taxid)
         if child_taxid != parent_taxid:
diff --git a/dashboard/determine_key_clades.py b/dashboard/determine_key_clades.py
index 9ab9ced1..22fb412c 100755
--- a/dashboard/determine_key_clades.py
+++ b/dashboard/determine_key_clades.py
@@ -10,7 +10,9 @@
 children = defaultdict(set)  # parent_taxid -> [children]
 with open("nodes.dmp") as inf:
     for line in inf:
-        child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split("\t|\t")
+        child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split(
+            "\t|\t"
+        )
         child_taxid = int(child_taxid)
         parent_taxid = int(parent_taxid)
         if child_taxid != parent_taxid:
diff --git a/dashboard/prepare-dashboard-data.py b/dashboard/prepare-dashboard-data.py
index e3827c30..f81ca1f3 100755
--- a/dashboard/prepare-dashboard-data.py
+++ b/dashboard/prepare-dashboard-data.py
@@ -39,7 +39,9 @@
 parents = {}  # child_taxid -> parent_taxid
 with open("%s/nodes.dmp" % DASHBOARD_DIR) as inf:
     for line in inf:
-        child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split("\t|\t")
+        child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split(
+            "\t|\t"
+        )
         child_taxid = int(child_taxid)
         parent_taxid = int(parent_taxid)
         parents[child_taxid] = parent_taxid
@@ -51,7 +53,9 @@
 # project -> sample -> n_reads
 project_sample_reads = defaultdict(dict)
 
-for metadata_fname in glob.glob("%s/bioprojects/*/metadata/metadata.tsv" % ROOT_DIR):
+for metadata_fname in glob.glob(
+    "%s/bioprojects/*/metadata/metadata.tsv" % ROOT_DIR
+):
     project = metadata_fname.split("/")[-3]
     if project in ["PRJEB30546", "PRJNA691135"]:
         # didn't finish importing this one, and the dashboard chokes on papers
@@ -137,7 +141,9 @@
 # paper -> {link, samples, projects, na_type, subset}
 papers = {}
 for project in projects:
-    with open("%s/bioprojects/%s/metadata/name.txt" % (ROOT_DIR, project)) as inf:
+    with open(
+        "%s/bioprojects/%s/metadata/name.txt" % (ROOT_DIR, project)
+    ) as inf:
         paper_name = inf.read().strip()
     if paper_name not in papers:
         papers[paper_name] = {}
@@ -169,7 +175,8 @@
 
 def rc(s):
     return "".join(
-        {"T": "A", "G": "C", "A": "T", "C": "G", "N": "N"}[x] for x in reversed(s)
+        {"T": "A", "G": "C", "A": "T", "C": "G", "N": "N"}[x]
+        for x in reversed(s)
     )
 
 
@@ -289,7 +296,9 @@ def count_dups(hvr_fname):
 taxonomic_names = defaultdict(list)
 with open("%s/names.dmp" % DASHBOARD_DIR) as inf:
     for line in inf:
-        taxid, name, unique_name, name_class = line.replace("\t|\n", "").split("\t|\t")
+        taxid, name, unique_name, name_class = line.replace("\t|\n", "").split(
+            "\t|\t"
+        )
         taxid = int(taxid)
 
         if taxid in mentioned_taxids or taxid in comparison_sample_counts:
@@ -305,19 +314,26 @@ def count_dups(hvr_fname):
 
 sample_metadata = defaultdict(dict)
 for project in projects:
-    with open("%s/bioprojects/%s/metadata/metadata.tsv" % (ROOT_DIR, project)) as inf:
+    with open(
+        "%s/bioprojects/%s/metadata/metadata.tsv" % (ROOT_DIR, project)
+    ) as inf:
         for line in inf:
             if not line.strip():
                 continue
             line = line[:-1]  # drop trailing newline
-            sample, sample_metadata_dict = sample_metadata_classifier.interpret(
+            (
+                sample,
+                sample_metadata_dict,
+            ) = sample_metadata_classifier.interpret(
                 project, papers, line.split("\t")
             )
             sample_metadata[sample] = sample_metadata_dict
 
     for sample in project_sample_reads[project]:
-        sample_metadata[sample]["reads"] = project_sample_reads[project][sample]
+        sample_metadata[sample]["reads"] = project_sample_reads[project][
+            sample
+        ]
 
         rf_fname = "ribofrac/%s.ribofrac.txt" % sample
         try:
@@ -349,7 +365,10 @@ def count_dups(hvr_fname):
 ]:
     with open(DASHBOARD_DIR + name + ".json", "w") as outf:
         json.dump(
-            val, outf, sort_keys=True, indent=None if val is human_virus_tree else 2
+            val,
+            outf,
+            sort_keys=True,
+            indent=None if val is human_virus_tree else 2,
         )
 
 # To make the dashboard load faster, divide counts by bioproject and don't load
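The rc() helper rewrapped in the hunk above is a reverse complement over DNA
bases; black only changed its layout. For reference, a standalone copy with a
usage check (a sketch mirroring the post-format code):

    def rc(s):
        # Reverse the sequence, then complement each base (N maps to N).
        return "".join(
            {"T": "A", "G": "C", "A": "T", "C": "G", "N": "N"}[x]
            for x in reversed(s)
        )

    assert rc("AAGT") == "ACTT"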
diff --git a/dashboard/sample_metadata_classifier.py b/dashboard/sample_metadata_classifier.py
index 37678464..26516875 100644
--- a/dashboard/sample_metadata_classifier.py
+++ b/dashboard/sample_metadata_classifier.py
@@ -61,12 +61,18 @@ def interpret(project, papers, bits):
     elif project in papers["Bengtsson-Palme 2016"]["projects"]:
         sample, location, site = bits
         return sample, dict(
-            date="2012-09", country="Sweden", location=location, fine_location=site
+            date="2012-09",
+            country="Sweden",
+            location=location,
+            fine_location=site,
         )
     elif project in papers["Brinch 2020"]["projects"]:
         sample, loc, date = bits
         return sample, dict(
-            date=date, country="Denmark", location="Copenhagen", fine_location=loc
+            date=date,
+            country="Denmark",
+            location="Copenhagen",
+            fine_location=loc,
         )
     elif project in papers["Spurbeck 2023"]["projects"]:
         sample, loc, date = bits
@@ -158,7 +164,10 @@ def interpret(project, papers, bits):
     elif project in papers["Hendriksen 2019"]["projects"]:
         sample, date, cluster = bits
         return sample, dict(
-            country="Kenya", location="Kibera", fine_location=cluster, date=date
+            country="Kenya",
+            location="Kibera",
+            fine_location=cluster,
+            date=date,
         )
     elif project in papers["Yang 2020"]["projects"]:
         sample, city = bits
@@ -168,7 +177,10 @@ def interpret(project, papers, bits):
     elif project in papers["Wang 2022"]["projects"]:
         sample, date, hospital = bits
         return sample, dict(
-            country="Saudi Arabia", location="Jeddah", date=date, fine_location=hospital
+            country="Saudi Arabia",
+            location="Jeddah",
+            date=date,
+            fine_location=hospital,
         )
     elif project in papers["Cui 2023"]["projects"]:
         (sample,) = bits
@@ -259,7 +271,11 @@ def interpret(project, papers, bits):
         sample, _, enrichment, loc, city_state, date, flow = bits
         city, state = city_state.split(", ")
         record = dict(
-            country="United States", city=city, state="Texas", location=loc, date=date
+            country="United States",
+            city=city,
+            state="Texas",
+            location=loc,
+            date=date,
         )
         if enrichment == "1":
             record["enrichment"] = "panel"
diff --git a/expand-human-viruses.py b/expand-human-viruses.py
index a7393eb5..9fa60217 100755
--- a/expand-human-viruses.py
+++ b/expand-human-viruses.py
@@ -16,7 +16,9 @@
 children = {}
 with open("dashboard/nodes.dmp") as inf:
     for line in inf:
-        child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split("\t|\t")
+        child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split(
+            "\t|\t"
+        )
         child_taxid = int(child_taxid)
         parent_taxid = int(parent_taxid)
 
@@ -42,7 +44,9 @@ def add_children(taxid):
 taxonomic_names = {}
 with open("dashboard/names.dmp") as inf:
     for line in inf:
-        taxid, name, unique_name, name_class = line.replace("\t|\n", "").split("\t|\t")
+        taxid, name, unique_name, name_class = line.replace("\t|\n", "").split(
+            "\t|\t"
+        )
         taxid = int(taxid)
 
         if taxid in hv:
diff --git a/papers/Brinch2020/prepare_metadata.py b/papers/Brinch2020/prepare_metadata.py
index 31d6047a..298b032b 100755
--- a/papers/Brinch2020/prepare_metadata.py
+++ b/papers/Brinch2020/prepare_metadata.py
@@ -16,8 +16,12 @@
         sample_to_details[bits[7]] = bits[15], bits[17]
 
 for project in ["PRJEB34633", "PRJEB13832"]:
-    with open("../../bioprojects/%s/metadata/metadata_raw.tsv" % project) as inf:
-        with open("../../bioprojects/%s/metadata/metadata.tsv" % project, "w") as outf:
+    with open(
+        "../../bioprojects/%s/metadata/metadata_raw.tsv" % project
+    ) as inf:
+        with open(
+            "../../bioprojects/%s/metadata/metadata.tsv" % project, "w"
+        ) as outf:
             for line in inf:
                 sample = line.strip()
                 if sample not in sample_to_details:
diff --git a/papers/Munk2022/prepare-metadata.py b/papers/Munk2022/prepare-metadata.py
index 1a6b3374..da8aabe6 100755
--- a/papers/Munk2022/prepare-metadata.py
+++ b/papers/Munk2022/prepare-metadata.py
@@ -115,7 +115,9 @@ def clean_date(x):
 bioproject_dir = os.path.join(root, "bioprojects")
 
 for bioproject in os.listdir(bioproject_dir):
-    with open(os.path.join(bioproject_dir, bioproject, "metadata", "name.txt")) as inf:
+    with open(
+        os.path.join(bioproject_dir, bioproject, "metadata", "name.txt")
+    ) as inf:
         if inf.read().strip() != "Munk 2022":
             continue
 
diff --git a/pipeline-operation/screen-summary.py b/pipeline-operation/screen-summary.py
index 438c9192..def54009 100755
--- a/pipeline-operation/screen-summary.py
+++ b/pipeline-operation/screen-summary.py
@@ -41,7 +41,9 @@ def start():
         with tempfile.TemporaryDirectory() as workdir:
             tmpfname = os.path.join(workdir, "tmp.txt")
 
-            subprocess.check_call(["screen", "-S", screen, "-X", "hardcopy", tmpfname])
+            subprocess.check_call(
+                ["screen", "-S", screen, "-X", "hardcopy", tmpfname]
+            )
 
             # wait for screen to dump like we asked
             while not os.path.exists(tmpfname):
diff --git a/reprocess-bioprojects.py b/reprocess-bioprojects.py
index 246a1ef4..03308738 100755
--- a/reprocess-bioprojects.py
+++ b/reprocess-bioprojects.py
@@ -28,7 +28,9 @@
 restricted_bioprojects = []
 restricted_dir = os.path.join("..", "mgs-restricted")
 if os.path.exists(restricted_dir):
-    restricted_bioprojects = os.listdir(os.path.join(restricted_dir, "bioprojects"))
+    restricted_bioprojects = os.listdir(
+        os.path.join(restricted_dir, "bioprojects")
+    )
 
 
 def prepare_job(bioproject, log_prefix, run_args):
@@ -85,7 +87,8 @@ def start():
         help="Log prefix, for storing this run under log/",
     )
     parser.add_argument(
-        "--bioprojects", help="The IDs of the bioproject to process, comma separated"
+        "--bioprojects",
+        help="The IDs of the bioproject to process, comma separated",
     )
     args = parser.parse_args(our_args)
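The first run.py hunk below rewraps check_output_shell, whose comment explains
why it wraps commands in `set -o pipefail`: by default a bash pipeline's exit
status is that of its last command only, so upstream failures vanish. A
standalone sketch of the difference (illustrative command only):

    # pipefail_demo.py -- illustrative only.
    import subprocess

    cmd = "false | cat"  # upstream fails, downstream succeeds

    # Without pipefail, cat's exit status wins and the failure is hidden.
    print(subprocess.run(["bash", "-c", cmd]).returncode)  # 0

    # With pipefail, as in run.py's check_output_shell, the failure shows.
    print(subprocess.run(["bash", "-c", "set -o pipefail; " + cmd]).returncode)  # 1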
diff --git a/run.py b/run.py
index 4f6bc98b..343d7f9f 100755
--- a/run.py
+++ b/run.py
@@ -40,7 +40,9 @@ def check_call_shell(cmd):
 def check_output_shell(cmd):
     # Unlike subprocess.check_output, if any member of the pipeline fails then
     # this fails too.
-    return subprocess.check_output(["bash", "-c", "set -o pipefail; %s" % cmd])
+    return subprocess.check_output(
+        ["bash", "-c", "set -o pipefail; %s" % cmd]
+    )
 
 
 def work_fname(*fnames):
@@ -71,7 +73,9 @@ def tempdir(stage, msg):
 
 def exists_s3_prefix(s3_path):
     try:
-        subprocess.check_call(["aws", "s3", "ls", s3_path], stdout=subprocess.DEVNULL)
+        subprocess.check_call(
+            ["aws", "s3", "ls", s3_path], stdout=subprocess.DEVNULL
+        )
         return True  # exit code 0 if present
     except subprocess.CalledProcessError as e:
         if e.returncode == 1:
@@ -129,10 +133,15 @@ def get_adapters(in1, in2, adapter1_fname, adapter2_fname):
         elif "--adapter2:" in line:
             adapter2 = line.replace("--adapter2:", "").strip()
 
-    for adapter, fname in [[adapter1, adapter1_fname], [adapter2, adapter2_fname]]:
+    for adapter, fname in [
+        [adapter1, adapter1_fname],
+        [adapter2, adapter2_fname],
+    ]:
         if not all(x in "ACTGN" for x in adapter) or len(adapter) < 20:
             print(output)
-            raise Exception("Invalid adapter %r for %r and %r" % (adapter, in1, in2))
+            raise Exception(
+                "Invalid adapter %r for %r and %r" % (adapter, in1, in2)
+            )
         with open(fname, "w") as outf:
             outf.write(adapter)
 
@@ -189,7 +198,9 @@ def adapter_removal(args, dirname, trim_quality, collapse):
         adapter1_fname = os.path.join(adapter_dir, "%s.fwd" % sample)
         adapter2_fname = os.path.join(adapter_dir, "%s.rev" % sample)
 
-        if not os.path.exists(adapter1_fname) or not os.path.exists(adapter2_fname):
+        if not os.path.exists(adapter1_fname) or not os.path.exists(
+            adapter2_fname
+        ):
             get_adapters(in1, in2, adapter1_fname, adapter2_fname)
 
         with open(adapter1_fname) as inf:
@@ -266,7 +277,9 @@ def ribofrac(args, subset_size=1000):
 
     def first_subset_fastq(file_paths, subset_size):
         """Selects the first subset of reads from gzipped fastq files"""
-        print(f"Counting reads in input and selecting the first {subset_size}...")
+        print(
+            f"Counting reads in input and selecting the first {subset_size}..."
+        )
         output_files = []
         total_reads = 0
         # Count the total number of reads only for the first input file
@@ -281,11 +294,15 @@ def ribofrac(args, subset_size=1000):
             # Create an output file handle
             output_file = fp.replace(".fq.gz", ".subset.fq")
             with open(output_file, "w") as out_handle:
-                for index, (title, seq, qual) in enumerate(FastqGeneralIterator(f)):
+                for index, (title, seq, qual) in enumerate(
+                    FastqGeneralIterator(f)
+                ):
                     if index >= subset_size:
                         break
                     actual_subset_size += 1
-                    out_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
+                    out_handle.write(
+                        "@%s\n%s\n+\n%s\n" % (title, seq, qual)
+                    )
 
             output_files.append(output_file)
 
@@ -333,12 +350,16 @@ def file_integrity_check(filename):
                 total_files_in_sample += 1
 
                 # Number of output and input files must match
-                tmp_fq_output = potential_input.replace(".gz", ".subset.nonrrna.fq")
+                tmp_fq_output = potential_input.replace(
+                    ".gz", ".subset.nonrrna.fq"
+                )
                 tmp_fq_outputs = [tmp_fq_output]
                 inputs = [potential_input]
 
                 if ".pair1." in potential_input:
-                    tmp_fq_outputs.append(tmp_fq_output.replace(".pair1.", ".pair2."))
+                    tmp_fq_outputs.append(
+                        tmp_fq_output.replace(".pair1.", ".pair2.")
+                    )
                     inputs.append(potential_input.replace(".pair1.", ".pair2."))
                 elif ".pair2." in potential_input:
                     # Ribodetector handles pair1 and pair2 together.
@@ -363,7 +384,9 @@ def file_integrity_check(filename):
                     empty_files_in_sample += 1
 
                 # Ribodetector gets angry if the .fq extension isn't in the filename
-                os.rename(input_fname, input_fname.replace(".gz", ".fq.gz"))
+                os.rename(
+                    input_fname, input_fname.replace(".gz", ".fq.gz")
+                )
 
         if total_files_in_sample == empty_files_for_sample:
             print(f"Skipping {sample}... all files are empty.")
@@ -387,7 +410,9 @@ def calculate_average_read_length(file_path):
                 total_len = 0
                 total_reads = 0
                 with open(file_path, "rt") as inf:
-                    for title, sequence, quality in FastqGeneralIterator(inf):
+                    for title, sequence, quality in FastqGeneralIterator(
+                        inf
+                    ):
                         total_len += len(sequence)
                         total_reads += 1
                 return round(total_len / total_reads)
@@ -416,7 +441,10 @@
             # Count number of rRNA reads in subset
             non_rrna_count = sum(
-                1 for _ in FastqGeneralIterator(open(tmp_fq_outputs[0], "rt"))
+                1
+                for _ in FastqGeneralIterator(
+                    open(tmp_fq_outputs[0], "rt")
+                )
             )
 
             rrna_reads_dict[inputs[0]] = subset_reads - non_rrna_count
@@ -429,7 +457,9 @@
         # Use the total number of reads for each input as weights
         weights = list(total_reads_dict.values())
-        weighted_rrna_fraction = np.average(fractions_rrna_in_subset, weights=weights)
+        weighted_rrna_fraction = np.average(
+            fractions_rrna_in_subset, weights=weights
+        )
         fraction_rrna = round(weighted_rrna_fraction, 4)
 
         print(
@@ -488,7 +518,9 @@ def riboreads(args):
                 inputs = [potential_input]
 
                 if ".pair1." in potential_input:
-                    tmp_fq_outputs.append(tmp_fq_output.replace(".pair1.", ".pair2."))
+                    tmp_fq_outputs.append(
+                        tmp_fq_output.replace(".pair1.", ".pair2.")
+                    )
                     inputs.append(potential_input.replace(".pair1.", ".pair2."))
                 elif ".pair2." in potential_input:
                     # Ribodetector handles pair1 and pair2 together.
@@ -508,7 +540,9 @@ def riboreads(args):
                     )
 
                 # Ribodetector gets angry if the .fq extension isn't in the filename
-                os.rename(input_fname, input_fname.replace(".gz", ".fq.gz"))
+                os.rename(
+                    input_fname, input_fname.replace(".gz", ".fq.gz")
+                )
 
         # Add .fq extensions to input files
         inputs = [i.replace(".gz", ".fq.gz") for i in inputs]
@@ -521,7 +555,9 @@ def calculate_average_read_length(file_path):
                 total_len = 0
                 total_reads = 0
                 with gzip.open(file_path, "rt") as inf:
-                    for title, sequence, quality in FastqGeneralIterator(inf):
+                    for title, sequence, quality in FastqGeneralIterator(
+                        inf
+                    ):
                         total_len += len(sequence)
                         total_reads += 1
                 return round(total_len / total_reads)
@@ -565,7 +601,9 @@ def parse_reads(file_path):
                     mode = "r"
                 with open_func(file_path, mode) as inf:
-                    for title, sequence, quality in FastqGeneralIterator(inf):
+                    for title, sequence, quality in FastqGeneralIterator(
+                        inf
+                    ):
                         seq_titles.append(title)
                 return seq_titles
 
@@ -692,7 +730,9 @@ def cladecounts(args):
         if not any(x.startswith(sample) for x in available_inputs):
             continue
 
-        subprocess.check_call(["./count_clades.sh", S3_BUCKET, args.bioproject, sample])
+        subprocess.check_call(
+            ["./count_clades.sh", S3_BUCKET, args.bioproject, sample]
+        )
 
 
 SAMPLE_READS_TARGET_LEN = 100_000
@@ -708,7 +748,9 @@ def samplereads(args):
     parents = {}
     with open(os.path.join(THISDIR, "dashboard", "nodes.dmp")) as inf:
         for line in inf:
-            child_taxid, parent_taxid, *_ = line.replace("\t|\n", "").split("\t|\t")
+            child_taxid, parent_taxid, *_ = line.replace("\t|\n", "").split(
+                "\t|\t"
+            )
             parents[int(child_taxid)] = int(parent_taxid)
 
     def taxid_under(clade, taxid):
@@ -799,13 +841,17 @@ def taxid_matches(taxid, category):
                        // full_count
                     )
 
-                subsetted_ids[category].extend(read_ids[fname][category][:target])
+                subsetted_ids[category].extend(
+                    read_ids[fname][category][:target]
+                )
 
         with tempdir("samplereads", sample) as workdir:
             with gzip.open(output, "wt") as outf:
                 for category in sorted(subsetted_ids):
                     for selected_read_id in sorted(subsetted_ids[category]):
-                        outf.write("%s\t%s\n" % (category[0], selected_read_id))
+                        outf.write(
+                            "%s\t%s\n" % (category[0], selected_read_id)
+                        )
 
             subprocess.check_call(
                 [
@@ -828,7 +874,9 @@ def readlengths(args):
         if output in existing_outputs:
             continue
 
-        inputs = [x for x in available_samplereads_inputs if x.startswith(sample)]
+        inputs = [
+            x for x in available_samplereads_inputs if x.startswith(sample)
+        ]
         if not any(inputs):
             continue
         (fname,) = inputs
@@ -968,7 +1016,9 @@ def humanviruses(args):
         with tempdir("humanviruses", sample) as workdir:
             with open(output, "w") as outf:
                 for taxid, count in sorted(counts.items()):
-                    outf.write("%s\t%s\t%s\n" % (taxid, count, human_viruses[taxid]))
+                    outf.write(
+                        "%s\t%s\t%s\n" % (taxid, count, human_viruses[taxid])
+                    )
 
             subprocess.check_call(
                 [
                     "aws",
@@ -976,7 +1026,8 @@ def humanviruses(args):
                     "s3",
                     "cp",
                     output,
-                    "%s/%s/humanviruses/%s" % (S3_BUCKET, args.bioproject, output),
+                    "%s/%s/humanviruses/%s"
+                    % (S3_BUCKET, args.bioproject, output),
                 ]
             )
 
@@ -1048,7 +1099,8 @@ def allmatches(args):
                     "s3",
                     "cp",
                     output,
-                    "%s/%s/allmatches/%s" % (S3_BUCKET, args.bioproject, output),
+                    "%s/%s/allmatches/%s"
+                    % (S3_BUCKET, args.bioproject, output),
                 ]
             )
 
@@ -1085,7 +1137,8 @@ def hvreads(args):
                 "aws",
                 "s3",
                 "cp",
-                "%s/%s/allmatches/%s" % (S3_BUCKET, args.bioproject, input_fname),
+                "%s/%s/allmatches/%s"
+                % (S3_BUCKET, args.bioproject, input_fname),
                 "-",
             ]
         )
@@ -1149,7 +1202,9 @@ def alignments(args):
     existing_outputs = get_files(args, "alignments", min_date="2023-11-28")
 
-    with open(os.path.join(THISDIR, "bowtie", "genomeid-to-taxid.json")) as inf:
+    with open(
+        os.path.join(THISDIR, "bowtie", "genomeid-to-taxid.json")
+    ) as inf:
         genomeid_to_taxid = json.load(inf)
 
     for sample in get_samples(args):
@@ -1171,7 +1226,9 @@ def alignments(args):
             inputs = [potential_input]
             if ".pair1." in output:
                 output = output.replace(".pair1.", ".")
-                inputs.append(potential_input.replace(".pair1.", ".pair2."))
+                inputs.append(
+                    potential_input.replace(".pair1.", ".pair2.")
+                )
             elif ".pair2" in output:
                 # We handle pair1 and pair2 together.
                 continue
@@ -1182,7 +1239,8 @@ def alignments(args):
            # short?  I remember Bowtie choking on these before.
 
             full_inputs = [
-                "%s/%s/cleaned/%s" % (S3_BUCKET, args.bioproject, input_fname)
+                "%s/%s/cleaned/%s"
+                % (S3_BUCKET, args.bioproject, input_fname)
                 for input_fname in inputs
             ]
 
@@ -1263,7 +1321,8 @@ def print_status(args):
         bioprojects = [args.bioproject]
     else:
         bioprojects = [
-            os.path.basename(x) for x in glob.glob(work_fname("bioprojects", "*"))
+            os.path.basename(x)
+            for x in glob.glob(work_fname("bioprojects", "*"))
         ]
 
     running_processes = subprocess.check_output(["ps", "aux"]).decode("utf-8")
@@ -1396,10 +1455,14 @@ def start():
     )
 
     parser.add_argument(
-        "--restricted", action="store_true", help="Whether to work on private data"
+        "--restricted",
+        action="store_true",
+        help="Whether to work on private data",
     )
 
-    parser.add_argument("--bioproject", help="The ID of the bioproject to process")
+    parser.add_argument(
+        "--bioproject", help="The ID of the bioproject to process"
+    )
 
     parser.add_argument(
         "--sample",
         default="",
@@ -1421,7 +1484,9 @@ def start():
     )
 
     parser.add_argument(
-        "--skip-stages", default="", help="Comma-separated list of stages not to run."
+        "--skip-stages",
+        default="",
+        help="Comma-separated list of stages not to run.",
     )
 
     args = parser.parse_args()
@@ -1445,7 +1510,8 @@ def start():
 
     if not os.path.isdir(work_fname("bioprojects", args.bioproject)):
         raise Exception(
-            "Bioproject %s not found in %s/bioprojects" % (args.bioproject, WORK_ROOT)
+            "Bioproject %s not found in %s/bioprojects"
+            % (args.bioproject, WORK_ROOT)
         )
 
     selected_stages = args.stages.split(",")
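One pattern recurs in six of the rewrapped files: parsing NCBI-style taxonomy
dumps with `line.replace("\t|\n", "").split("\t|\t")`. For readers unfamiliar
with the nodes.dmp layout, a self-contained sketch of that parse (the two
sample records use real human-lineage taxids, but are hardcoded here purely
for illustration):

    # nodes_demo.py -- illustrative sketch of the nodes.dmp parse that
    # black rewrapped throughout this patch.
    import io

    # nodes.dmp lines are "\t|\t"-delimited and end with "\t|\n".
    sample = "9606\t|\t9605\t|\tspecies\t|\n131567\t|\t1\t|\tno rank\t|\n"

    parents = {}  # child_taxid -> parent_taxid
    for line in io.StringIO(sample):
        # Strip the terminator, split on the delimiter; *_ discards any
        # fields past the third.
        child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split(
            "\t|\t"
        )
        parents[int(child_taxid)] = int(parent_taxid)

    print(parents)  # {9606: 9605, 131567: 1}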