Minor adjustments on python tests
Adjusted pytest to include the required folders in the pythonpath
mberacochea committed Nov 6, 2024
1 parent ce6999c commit c65c9e4
Showing 15 changed files with 76 additions and 60 deletions.
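Note: the change removes the PYTHONPATH environment variable from the CI workflow and relies on pytest's built-in pythonpath ini option instead (see the pyproject.toml diff below). A minimal sketch of roughly what that option does at session start, assuming the repository root is also the pytest rootdir:

import sys
from pathlib import Path

# Assumption: this sketch sits at the repository root, which is pytest's rootdir.
rootdir = Path(__file__).resolve().parent

# pytest resolves each `pythonpath` entry against the rootdir and prepends it to
# sys.path before test collection, so helper scripts become importable without
# any environment variable being exported in CI.
for entry in [".", "bin", "helpers/database_import_scripts/uniprot", "tests"]:
    sys.path.insert(0, str((rootdir / entry).resolve()))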
2 changes: 1 addition & 1 deletion .github/workflows/python_unitests.yml
@@ -20,4 +20,4 @@ jobs:
pip install -U -r requirements-test.txt
- name: Testing
run: |
PYTHONPATH="$PYTHONPATH:bin:helpers/database-import-scripts/uniprot" pytest
pytest
@@ -20,7 +20,10 @@ def main(rfam_file, output):
if rfam_lengths[name] == model_length:
pass
else:
print("ERROR: same name occurs multiple times and lengths do not match", name)
print(
"ERROR: same name occurs multiple times and lengths do not match",
name,
)
else:
rfam_lengths[name] = model_length
name = ""
@@ -30,17 +30,27 @@ def main(rfam_file, output):


def parse_args():
parser = argparse.ArgumentParser(description="Script produces a file with lengths of Rfam covariance models."
"The resulting file is necessary for running the JSON generating"
"script and only needs to be produced once unless the Rfam.cm"
"file is updated.")
parser.add_argument('-r', '--rfam-file', required=True,
help='Path to the Rfam.cm file containing concatenated Rfam models.')
parser.add_argument('-o', '--output', required=True,
help='Path to the output file where the lengths of models will be saved to.')
parser = argparse.ArgumentParser(
description="Script produces a file with lengths of Rfam covariance models."
"The resulting file is necessary for running the JSON generating"
"script and only needs to be produced once unless the Rfam.cm"
"file is updated."
)
parser.add_argument(
"-r",
"--rfam-file",
required=True,
help="Path to the Rfam.cm file containing concatenated Rfam models.",
)
parser.add_argument(
"-o",
"--output",
required=True,
help="Path to the output file where the lengths of models will be saved to.",
)
return parser.parse_args()


if __name__ == '__main__':
if __name__ == "__main__":
args = parse_args()
main(args.rfam_file, args.output)
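
Note: the hunk above only shows the duplicate-name check; the code that actually extracts each model length is outside the diff. A minimal sketch of that kind of parsing, assuming the concatenated Rfam.cm text carries NAME and CLEN records (field names taken from the INFERNAL CM format, not from this commit):

def read_model_lengths(rfam_path):
    # Sketch: map each covariance model name to its CLEN value.
    lengths = {}
    name = ""
    with open(rfam_path) as handle:
        for line in handle:
            if line.startswith("NAME"):
                name = line.split()[1]
            elif line.startswith("CLEN") and name:
                model_length = line.split()[1]
                if name in lengths and lengths[name] != model_length:
                    # Mirrors the error branch shown in the diff above.
                    print("ERROR: same name occurs multiple times and lengths do not match", name)
                else:
                    lengths[name] = model_length
                name = ""
    return lengths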
Empty file.
10 changes: 5 additions & 5 deletions helpers/file_organiser.sh
@@ -45,8 +45,8 @@ function GenerateRNACentralJSON {

echo "Running JSON generation"
mitload miniconda && conda activate pybase
python3 /nfs/production/rdf/metagenomics/pipelines/prod/genomes-pipeline/helpers/database-import-scripts/rnacentral/generate_rnacentral_json.py \
-r /nfs/production/rdf/metagenomics/pipelines/prod/genomes-pipeline/helpers/database-import-scripts/rnacentral/rfam_model_lengths_14.9.txt \
python3 /nfs/production/rdf/metagenomics/pipelines/prod/genomes-pipeline/helpers/database_import_scripts/rnacentral/generate_rnacentral_json.py \
-r /nfs/production/rdf/metagenomics/pipelines/prod/genomes-pipeline/helpers/database_import_scripts/rnacentral/rfam_model_lengths_14.9.txt \
-m ${RESULTS_PATH}/genomes-all_metadata.tsv -o ${RESULTS_PATH}/additional_data/rnacentral/${CATALOGUE_FOLDER}-rnacentral.json \
-d ${RESULTS_PATH}/additional_data/ncrna_deoverlapped_species_reps/ -g ${RESULTS_PATH}/additional_data/rnacentral/GFFs/ \
-f ${RESULTS_PATH}/additional_data/mgyg_genomes/
@@ -121,19 +121,19 @@ function GenerateUniprotFiles {
fi

mitload miniconda && conda activate pybase
python3 /nfs/production/rdf/metagenomics/pipelines/prod/genomes-pipeline/helpers/database-import-scripts/uniprot/preprocess_taxonomy_for_uniprot.py \
python3 /nfs/production/rdf/metagenomics/pipelines/prod/genomes-pipeline/helpers/database_import_scripts/uniprot/preprocess_taxonomy_for_uniprot.py \
-g ${RESULTS_PATH}/additional_data/gtdb-tk_output/ -r "r214" -v "2" -o ${RESULTS_PATH}/additional_data/uniprot/preprocessed_taxonomy.tsv

echo "Generating Uniprot files"
ACCS=$(ls ${RESULTS_PATH}/additional_data/prokka_gbk_species_reps/${F}.gbk | rev | cut -d '/' -f1 | rev | sed "s/\.gbk//")

for F in $ACCS; do python3 /nfs/production/rdf/metagenomics/pipelines/prod/genomes-pipeline/helpers/database-import-scripts/uniprot/convert_gbk.py \
for F in $ACCS; do python3 /nfs/production/rdf/metagenomics/pipelines/prod/genomes-pipeline/helpers/database_import_scripts/uniprot/convert_gbk.py \
-g ${RESULTS_PATH}/additional_data/prokka_gbk_species_reps/${F}.gbk \
-o ${RESULTS_PATH}/additional_data/uniprot/uniprot-files/${F}_uniprot.gbk \
-t ${RESULTS_PATH}/additional_data/uniprot/preprocessed_taxonomy.tsv; done

echo "Generating Uniprot metadata"
python3 /nfs/production/rdf/metagenomics/pipelines/prod/genomes-pipeline/helpers/database-import-scripts/uniprot/generate_uniprot_metadata.py \
python3 /nfs/production/rdf/metagenomics/pipelines/prod/genomes-pipeline/helpers/database_import_scripts/uniprot/generate_uniprot_metadata.py \
-m ${RESULTS_PATH}/genomes-all_metadata.tsv -o ${RESULTS_PATH}/additional_data/uniprot/${CATALOGUE_FOLDER}_${CATALOGUE_VERSION}_uniprot_metadata.tsv \
-p ${RESULTS_PATH}/additional_data/uniprot/preprocessed_taxonomy.tsv

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,5 +1,5 @@
[tool.pytest.ini_options]
pythonpath = [".", "tests"]
pythonpath = [".", "bin", "helpers/database_import_scripts/uniprot", "tests"]
python_files = ["test_*.py"]

[tool.ruff]
@@ -10,4 +10,4 @@ profile = "black"

[tool.black]
line-length = 110
experimental-string-processing = true
experimental-string-processing = true
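
Note: with bin and helpers/database_import_scripts/uniprot on pythonpath alongside the existing "." entry, the helper code can be reached either as a package through the repository root or as a top-level module through the extra entries. A short sketch of both forms; the package path matches the test diff below, while the top-level example is hypothetical:

# Package-style import, resolved through the "." entry; this is the form the
# rewritten test below uses.
from helpers.database_import_scripts.uniprot.preprocess_taxonomy_for_uniprot import parse_metadata

# The "bin" entry additionally makes any script stored directly in bin/ importable
# as a top-level module, e.g. (hypothetical module name):
#     import some_bin_script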
87 changes: 45 additions & 42 deletions tests/scripts/test_preprocess_taxonomy_for_uniprot.py
@@ -1,58 +1,61 @@
import importlib
import unittest
import os

from pathlib import Path

script_dir = Path(__file__).resolve().parent
uniprot_dir = script_dir / '../../helpers/database-import-scripts/uniprot'
uniprot_dir = uniprot_dir.resolve()

preprocess_taxonomy = importlib.import_module("helpers.database-import-scripts.uniprot.preprocess_taxonomy_for_uniprot")
parse_metadata =preprocess_taxonomy.parse_metadata
match_taxid_to_gca = preprocess_taxonomy.match_taxid_to_gca
get_species_level_taxonomy = preprocess_taxonomy.get_species_level_taxonomy
from helpers.database_import_scripts.uniprot.preprocess_taxonomy_for_uniprot import (
get_species_level_taxonomy,
match_taxid_to_gca,
parse_metadata,
)


class TestPreprocessTaxonomyForUniprot(unittest.TestCase):
def setUp(self):
self.metadata_file = os.path.join(script_dir, "fixtures", "preprocess_taxonomy_for_uniprot",
"sample_metadata.tsv")
self.metadata_file = (
Path(__file__).resolve().parent
/ "fixtures/preprocess_taxonomy_for_uniprot/sample_metadata.tsv"
)
self.gca_accessions, self.sample_accessions = parse_metadata(self.metadata_file)
self.mgyg_to_gca, self.gca_to_taxid = match_taxid_to_gca(self.gca_accessions, self.sample_accessions, 2)
self.mgyg_to_gca, self.gca_to_taxid = match_taxid_to_gca(
self.gca_accessions, self.sample_accessions, 2
)

def test_match_taxid_to_gca(self):
expected_gca_to_taxid = {'GCA_946182765': '370804',
'GCA_946182785': '244328',
'GCA_946182825': '297314',
'GCA_946182815': '370804',
'GCA_946182805': '190764',
'GCA_946182835': '370804',
'GCA_946183145': '1678694',
'GCA_946183335': '157472'}
expected_mgyg_to_gca = {'MGYG000304400': 'GCA_946182765',
'MGYG000304401': 'GCA_946182785',
'MGYG000304403': 'GCA_946182825',
'MGYG000304404': 'GCA_946182815',
'MGYG000304405': 'GCA_946182805',
'MGYG000304406': 'GCA_946182835',
'MGYG000304407': 'GCA_946183145',
'MGYG000304408': 'GCA_946183335'}
expected_gca_to_taxid = {
"GCA_946182765": "370804",
"GCA_946182785": "244328",
"GCA_946182825": "297314",
"GCA_946182815": "370804",
"GCA_946182805": "190764",
"GCA_946182835": "370804",
"GCA_946183145": "1678694",
"GCA_946183335": "157472",
}
expected_mgyg_to_gca = {
"MGYG000304400": "GCA_946182765",
"MGYG000304401": "GCA_946182785",
"MGYG000304403": "GCA_946182825",
"MGYG000304404": "GCA_946182815",
"MGYG000304405": "GCA_946182805",
"MGYG000304406": "GCA_946182835",
"MGYG000304407": "GCA_946183145",
"MGYG000304408": "GCA_946183335",
}
self.assertEqual(self.mgyg_to_gca, expected_mgyg_to_gca)
self.assertEqual(self.gca_to_taxid, expected_gca_to_taxid)

def test_get_species_level_taxonomy(self):
taxid1, name1, submittable1, lineage1 = get_species_level_taxonomy(
"k__Bacteria;p__Bacillota_I;c__Bacilli_A;o__RFN20;f__CAG-826;g__UBA3207;s__UBA3207 sp946183335")
self.assertEqual(lineage1, "d__Bacteria;p__Bacillota_I;c__Bacilli_A;o__RFN20;f__CAG-826;g__UBA3207;s__UBA3207 sp946183335")
_, name1, submittable1, lineage1 = get_species_level_taxonomy(
"k__Bacteria;p__Bacillota_I;c__Bacilli_A;o__RFN20;f__CAG-826;g__UBA3207;s__UBA3207 sp946183335"
)
self.assertEqual(
lineage1,
"d__Bacteria;p__Bacillota_I;c__Bacilli_A;o__RFN20;f__CAG-826;g__UBA3207;s__UBA3207 sp946183335",
)
self.assertEqual(submittable1, False)
self.assertEqual(name1, "UBA3207 sp946183335")

taxid2, name2, submittable2, lineage2 = get_species_level_taxonomy(
"d__Bacteria; p__Pseudomonadota; c__Gammaproteobacteria; o__Enterobacterales; f__Enterobacteriaceae; g__Escherichia; s__Escherichia coli")

taxid2, _, submittable2, _ = get_species_level_taxonomy(
"d__Bacteria; p__Pseudomonadota; c__Gammaproteobacteria; o__Enterobacterales; f__Enterobacteriaceae; g__Escherichia; s__Escherichia coli"
)
self.assertEqual(submittable2, True)
self.assertEqual(taxid2, '562')




self.assertEqual(taxid2, "562")
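
Note: the switch from importlib.import_module to a plain import in this test goes hand in hand with renaming database-import-scripts to database_import_scripts throughout the commit: an import statement cannot name a package whose directory contains a hyphen. A small sketch of the difference, using the paths from this repository:

# Old layout (hyphenated directories): a plain import is a SyntaxError,
#     import helpers.database-import-scripts.uniprot.preprocess_taxonomy_for_uniprot
# so the previous test reached the module through importlib, which accepts the
# dotted name as a string:
#     importlib.import_module("helpers.database-import-scripts.uniprot.preprocess_taxonomy_for_uniprot")

# New layout (underscored directories): every path component is a valid Python
# identifier, so the ordinary statement works and static tooling can resolve it.
from helpers.database_import_scripts.uniprot import preprocess_taxonomy_for_uniprot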
