diff --git a/.github/workflows/python_unitests.yml b/.github/workflows/python_unitests.yml
index d094a4ec..6cdf925b 100644
--- a/.github/workflows/python_unitests.yml
+++ b/.github/workflows/python_unitests.yml
@@ -20,4 +20,4 @@ jobs:
           pip install -U -r requirements-test.txt
       - name: Testing
         run: |
-          PYTHONPATH="$PYTHONPATH:bin:helpers/database-import-scripts/uniprot" pytest
+          pytest
diff --git a/helpers/database-import-scripts/rnacentral/calculate_model_lengths.py b/helpers/database_import_scripts/rnacentral/calculate_model_lengths.py
similarity index 55%
rename from helpers/database-import-scripts/rnacentral/calculate_model_lengths.py
rename to helpers/database_import_scripts/rnacentral/calculate_model_lengths.py
index 581af04d..d3bd75c8 100644
--- a/helpers/database-import-scripts/rnacentral/calculate_model_lengths.py
+++ b/helpers/database_import_scripts/rnacentral/calculate_model_lengths.py
@@ -20,7 +20,10 @@ def main(rfam_file, output):
             if rfam_lengths[name] == model_length:
                 pass
             else:
-                print("ERROR: same name occurs multiple times and lengths do not match", name)
+                print(
+                    "ERROR: same name occurs multiple times and lengths do not match",
+                    name,
+                )
         else:
             rfam_lengths[name] = model_length
             name = ""
@@ -30,17 +33,27 @@
 
 
 def parse_args():
-    parser = argparse.ArgumentParser(description="Script produces a file with lengths of Rfam covariance models."
-                                                 "The resulting file is necessary for running the JSON generating"
-                                                 "script and only needs to be produced once unless the Rfam.cm"
-                                                 "file is updated.")
-    parser.add_argument('-r', '--rfam-file', required=True,
-                        help='Path to the Rfam.cm file containing concatinated Rfam models.')
-    parser.add_argument('-o', '--output', required=True,
-                        help='Path to the output file where the lengths of models will be saved to.')
+    parser = argparse.ArgumentParser(
+        description="Script produces a file with lengths of Rfam covariance models."
+        "The resulting file is necessary for running the JSON generating"
+        "script and only needs to be produced once unless the Rfam.cm"
+        "file is updated."
+    )
+    parser.add_argument(
+        "-r",
+        "--rfam-file",
+        required=True,
+        help="Path to the Rfam.cm file containing concatinated Rfam models.",
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        required=True,
+        help="Path to the output file where the lengths of models will be saved to.",
+    )
     return parser.parse_args()
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     args = parse_args()
     main(args.rfam_file, args.output)
diff --git a/helpers/database-import-scripts/rnacentral/generate_rnacentral_json.py b/helpers/database_import_scripts/rnacentral/generate_rnacentral_json.py
similarity index 100%
rename from helpers/database-import-scripts/rnacentral/generate_rnacentral_json.py
rename to helpers/database_import_scripts/rnacentral/generate_rnacentral_json.py
diff --git a/helpers/database-import-scripts/rnacentral/rfam_model_lengths.txt b/helpers/database_import_scripts/rnacentral/rfam_model_lengths.txt
similarity index 100%
rename from helpers/database-import-scripts/rnacentral/rfam_model_lengths.txt
rename to helpers/database_import_scripts/rnacentral/rfam_model_lengths.txt
diff --git a/helpers/database-import-scripts/rnacentral/rfam_model_lengths_14.9.txt b/helpers/database_import_scripts/rnacentral/rfam_model_lengths_14.9.txt
similarity index 100%
rename from helpers/database-import-scripts/rnacentral/rfam_model_lengths_14.9.txt
rename to helpers/database_import_scripts/rnacentral/rfam_model_lengths_14.9.txt
diff --git a/helpers/database_import_scripts/uniprot/__init__.py b/helpers/database_import_scripts/uniprot/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/helpers/database-import-scripts/uniprot/convert_gbk.py b/helpers/database_import_scripts/uniprot/convert_gbk.py
similarity index 100%
rename from helpers/database-import-scripts/uniprot/convert_gbk.py
rename to helpers/database_import_scripts/uniprot/convert_gbk.py
diff --git a/helpers/database-import-scripts/uniprot/generate_uniprot_metadata.py b/helpers/database_import_scripts/uniprot/generate_uniprot_metadata.py
similarity index 100%
rename from helpers/database-import-scripts/uniprot/generate_uniprot_metadata.py
rename to helpers/database_import_scripts/uniprot/generate_uniprot_metadata.py
diff --git a/helpers/database-import-scripts/uniprot/gtdb_to_ncbi_majority_vote.py b/helpers/database_import_scripts/uniprot/gtdb_to_ncbi_majority_vote.py
similarity index 100%
rename from helpers/database-import-scripts/uniprot/gtdb_to_ncbi_majority_vote.py
rename to helpers/database_import_scripts/uniprot/gtdb_to_ncbi_majority_vote.py
diff --git a/helpers/database-import-scripts/uniprot/gtdb_to_ncbi_majority_vote_v2.py b/helpers/database_import_scripts/uniprot/gtdb_to_ncbi_majority_vote_v2.py
similarity index 100%
rename from helpers/database-import-scripts/uniprot/gtdb_to_ncbi_majority_vote_v2.py
rename to helpers/database_import_scripts/uniprot/gtdb_to_ncbi_majority_vote_v2.py
diff --git a/helpers/database-import-scripts/uniprot/preprocess_taxonomy_for_uniprot.py b/helpers/database_import_scripts/uniprot/preprocess_taxonomy_for_uniprot.py
similarity index 100%
rename from helpers/database-import-scripts/uniprot/preprocess_taxonomy_for_uniprot.py
rename to helpers/database_import_scripts/uniprot/preprocess_taxonomy_for_uniprot.py
diff --git a/helpers/database-import-scripts/uniprot/uniprot_sanity_check.py b/helpers/database_import_scripts/uniprot/uniprot_sanity_check.py
similarity index 100%
rename from helpers/database-import-scripts/uniprot/uniprot_sanity_check.py
rename to helpers/database_import_scripts/uniprot/uniprot_sanity_check.py
diff --git a/helpers/file_organiser.sh b/helpers/file_organiser.sh
index 1b161d4a..d0ea13cf 100644
--- a/helpers/file_organiser.sh
+++ b/helpers/file_organiser.sh
@@ -45,8 +45,8 @@ function GenerateRNACentralJSON {
 
     echo "Running JSON generation"
    mitload miniconda && conda activate pybase
-    python3 /nfs/production/rdf/metagenomics/pipelines/prod/genomes-pipeline/helpers/database-import-scripts/rnacentral/generate_rnacentral_json.py \
-    -r /nfs/production/rdf/metagenomics/pipelines/prod/genomes-pipeline/helpers/database-import-scripts/rnacentral/rfam_model_lengths_14.9.txt \
+    python3 /nfs/production/rdf/metagenomics/pipelines/prod/genomes-pipeline/helpers/database_import_scripts/rnacentral/generate_rnacentral_json.py \
+    -r /nfs/production/rdf/metagenomics/pipelines/prod/genomes-pipeline/helpers/database_import_scripts/rnacentral/rfam_model_lengths_14.9.txt \
     -m ${RESULTS_PATH}/genomes-all_metadata.tsv -o ${RESULTS_PATH}/additional_data/rnacentral/${CATALOGUE_FOLDER}-rnacentral.json \
     -d ${RESULTS_PATH}/additional_data/ncrna_deoverlapped_species_reps/ -g ${RESULTS_PATH}/additional_data/rnacentral/GFFs/ \
     -f ${RESULTS_PATH}/additional_data/mgyg_genomes/
@@ -121,19 +121,19 @@ function GenerateUniprotFiles {
     fi
 
     mitload miniconda && conda activate pybase
-    python3 /nfs/production/rdf/metagenomics/pipelines/prod/genomes-pipeline/helpers/database-import-scripts/uniprot/preprocess_taxonomy_for_uniprot.py \
+    python3 /nfs/production/rdf/metagenomics/pipelines/prod/genomes-pipeline/helpers/database_import_scripts/uniprot/preprocess_taxonomy_for_uniprot.py \
     -g ${RESULTS_PATH}/additional_data/gtdb-tk_output/ -r "r214" -v "2" -o ${RESULTS_PATH}/additional_data/uniprot/preprocessed_taxonomy.tsv
 
     echo "Generating Uniprot files"
 
     ACCS=$(ls ${RESULTS_PATH}/additional_data/prokka_gbk_species_reps/${F}.gbk | rev | cut -d '/' -f1 | rev | sed "s/\.gbk//")
 
-    for F in $ACCS; do python3 /nfs/production/rdf/metagenomics/pipelines/prod/genomes-pipeline/helpers/database-import-scripts/uniprot/convert_gbk.py \
+    for F in $ACCS; do python3 /nfs/production/rdf/metagenomics/pipelines/prod/genomes-pipeline/helpers/database_import_scripts/uniprot/convert_gbk.py \
     -g ${RESULTS_PATH}/additional_data/prokka_gbk_species_reps/${F}.gbk \
     -o ${RESULTS_PATH}/additional_data/uniprot/uniprot-files/${F}_uniprot.gbk \
     -t ${RESULTS_PATH}/additional_data/uniprot/preprocessed_taxonomy.tsv; done
 
     echo "Generating Uniprot metadata"
-    python3 /nfs/production/rdf/metagenomics/pipelines/prod/genomes-pipeline/helpers/database-import-scripts/uniprot/generate_uniprot_metadata.py \
+    python3 /nfs/production/rdf/metagenomics/pipelines/prod/genomes-pipeline/helpers/database_import_scripts/uniprot/generate_uniprot_metadata.py \
     -m ${RESULTS_PATH}/genomes-all_metadata.tsv -o ${RESULTS_PATH}/additional_data/uniprot/${CATALOGUE_FOLDER}_${CATALOGUE_VERSION}_uniprot_metadata.tsv \
     -p ${RESULTS_PATH}/additional_data/uniprot/preprocessed_taxonomy.tsv
diff --git a/pyproject.toml b/pyproject.toml
index ac623f98..4befaf73 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,5 @@
 [tool.pytest.ini_options]
-pythonpath = [".", "tests"]
+pythonpath = [".", "bin", "helpers/database_import_scripts/uniprot", "tests"]
 python_files = ["test_*.py"]
 
 [tool.ruff]
@@ -10,4 +10,4 @@ profile = "black"
 
 [tool.back]
 line-length = 110
-experimental-string-processing = true
\ No newline at end of file
+experimental-string-processing = true
diff --git a/tests/scripts/test_preprocess_taxonomy_for_uniprot.py b/tests/scripts/test_preprocess_taxonomy_for_uniprot.py
index 35149d14..c00fa35a 100644
--- a/tests/scripts/test_preprocess_taxonomy_for_uniprot.py
+++ b/tests/scripts/test_preprocess_taxonomy_for_uniprot.py
@@ -1,58 +1,61 @@
-import importlib
 import unittest
-import os
-
 from pathlib import Path
 
-script_dir = Path(__file__).resolve().parent
-uniprot_dir = script_dir / '../../helpers/database-import-scripts/uniprot'
-uniprot_dir = uniprot_dir.resolve()
-
-preprocess_taxonomy = importlib.import_module("helpers.database-import-scripts.uniprot.preprocess_taxonomy_for_uniprot")
-parse_metadata =preprocess_taxonomy.parse_metadata
-match_taxid_to_gca = preprocess_taxonomy.match_taxid_to_gca
-get_species_level_taxonomy = preprocess_taxonomy.get_species_level_taxonomy
+from helpers.database_import_scripts.uniprot.preprocess_taxonomy_for_uniprot import (
+    get_species_level_taxonomy,
+    match_taxid_to_gca,
+    parse_metadata,
+)
 
 
 class TestPreprocessTaxonomyForUniprot(unittest.TestCase):
     def setUp(self):
-        self.metadata_file = os.path.join(script_dir, "fixtures", "preprocess_taxonomy_for_uniprot",
-                                          "sample_metadata.tsv")
+        self.metadata_file = (
+            Path(__file__).resolve().parent
+            / "fixtures/preprocess_taxonomy_for_uniprot/sample_metadata.tsv"
+        )
         self.gca_accessions, self.sample_accessions = parse_metadata(self.metadata_file)
-        self.mgyg_to_gca, self.gca_to_taxid = match_taxid_to_gca(self.gca_accessions, self.sample_accessions, 2)
+        self.mgyg_to_gca, self.gca_to_taxid = match_taxid_to_gca(
+            self.gca_accessions, self.sample_accessions, 2
+        )
 
     def test_match_taxid_to_gca(self):
-        expected_gca_to_taxid = {'GCA_946182765': '370804',
-                                 'GCA_946182785': '244328',
-                                 'GCA_946182825': '297314',
-                                 'GCA_946182815': '370804',
-                                 'GCA_946182805': '190764',
-                                 'GCA_946182835': '370804',
-                                 'GCA_946183145': '1678694',
-                                 'GCA_946183335': '157472'}
-        expected_mgyg_to_gca = {'MGYG000304400': 'GCA_946182765',
-                                'MGYG000304401': 'GCA_946182785',
-                                'MGYG000304403': 'GCA_946182825',
-                                'MGYG000304404': 'GCA_946182815',
-                                'MGYG000304405': 'GCA_946182805',
-                                'MGYG000304406': 'GCA_946182835',
-                                'MGYG000304407': 'GCA_946183145',
-                                'MGYG000304408': 'GCA_946183335'}
+        expected_gca_to_taxid = {
+            "GCA_946182765": "370804",
+            "GCA_946182785": "244328",
+            "GCA_946182825": "297314",
+            "GCA_946182815": "370804",
+            "GCA_946182805": "190764",
+            "GCA_946182835": "370804",
+            "GCA_946183145": "1678694",
+            "GCA_946183335": "157472",
+        }
+        expected_mgyg_to_gca = {
+            "MGYG000304400": "GCA_946182765",
+            "MGYG000304401": "GCA_946182785",
+            "MGYG000304403": "GCA_946182825",
+            "MGYG000304404": "GCA_946182815",
+            "MGYG000304405": "GCA_946182805",
+            "MGYG000304406": "GCA_946182835",
+            "MGYG000304407": "GCA_946183145",
+            "MGYG000304408": "GCA_946183335",
+        }
         self.assertEqual(self.mgyg_to_gca, expected_mgyg_to_gca)
         self.assertEqual(self.gca_to_taxid, expected_gca_to_taxid)
-    
+
     def test_get_species_level_taxonomy(self):
-        taxid1, name1, submittable1, lineage1 = get_species_level_taxonomy(
-            "k__Bacteria;p__Bacillota_I;c__Bacilli_A;o__RFN20;f__CAG-826;g__UBA3207;s__UBA3207 sp946183335")
-        self.assertEqual(lineage1, "d__Bacteria;p__Bacillota_I;c__Bacilli_A;o__RFN20;f__CAG-826;g__UBA3207;s__UBA3207 sp946183335")
+        _, name1, submittable1, lineage1 = get_species_level_taxonomy(
+            "k__Bacteria;p__Bacillota_I;c__Bacilli_A;o__RFN20;f__CAG-826;g__UBA3207;s__UBA3207 sp946183335"
+        )
+        self.assertEqual(
+            lineage1,
+            "d__Bacteria;p__Bacillota_I;c__Bacilli_A;o__RFN20;f__CAG-826;g__UBA3207;s__UBA3207 sp946183335",
+        )
         self.assertEqual(submittable1, False)
         self.assertEqual(name1, "UBA3207 sp946183335")
-
-        taxid2, name2, submittable2, lineage2 = get_species_level_taxonomy(
-            "d__Bacteria; p__Pseudomonadota; c__Gammaproteobacteria; o__Enterobacterales; f__Enterobacteriaceae; g__Escherichia; s__Escherichia coli")
+
+        taxid2, _, submittable2, _ = get_species_level_taxonomy(
+            "d__Bacteria; p__Pseudomonadota; c__Gammaproteobacteria; o__Enterobacterales; f__Enterobacteriaceae; g__Escherichia; s__Escherichia coli"
+        )
         self.assertEqual(submittable2, True)
-        self.assertEqual(taxid2, '562')
-
-
-
-
+        self.assertEqual(taxid2, "562")
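
Reviewer note (not part of the diff above): the rename from database-import-scripts to database_import_scripts, together with the new helpers/database_import_scripts/uniprot/__init__.py and the "." entry that pytest's pythonpath option puts on sys.path, is what allows the tests to drop importlib.import_module. A hyphenated directory name cannot appear in a plain import statement, which is why the old test resolved the module dynamically. A minimal sketch of the new import path follows; the metadata file name is only illustrative, and the exact return types of parse_metadata are assumed from how the test uses them.

# Minimal sketch, assuming the repository root is on sys.path (as the pytest
# pythonpath setting arranges) and that namespace packages cover the
# intermediate directories.
from helpers.database_import_scripts.uniprot.preprocess_taxonomy_for_uniprot import (
    parse_metadata,
)

# parse_metadata returns the GCA and sample accessions read from the catalogue
# metadata table; "genomes-all_metadata.tsv" is an illustrative path.
gca_accessions, sample_accessions = parse_metadata("genomes-all_metadata.tsv")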