Minor adjustments on python tests
Adjusted pytest to include the required folders in the pythonpath
mberacochea committed Nov 6, 2024
1 parent ce6999c commit c65c9e4
Showing 15 changed files with 76 additions and 60 deletions.
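Note: the change removes the PYTHONPATH environment variable from the CI workflow and relies on pytest's built-in pythonpath ini option instead (see the pyproject.toml diff below). A minimal sketch of roughly what that option does at session start, assuming the repository root is also the pytest rootdir:

import sys
from pathlib import Path

# Assumption: this sketch sits at the repository root, which is pytest's rootdir.
rootdir = Path(__file__).resolve().parent

# pytest resolves each `pythonpath` entry against the rootdir and prepends it to
# sys.path before test collection, so helper scripts become importable without
# any environment variable being exported in CI.
for entry in [".", "bin", "helpers/database_import_scripts/uniprot", "tests"]:
    sys.path.insert(0, str((rootdir / entry).resolve()))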
2 changes: 1 addition & 1 deletion .github/workflows/python_unitests.yml
@@ -20,4 +20,4 @@ jobs:
pip install -U -r requirements-test.txt
- name: Testing
run: |
PYTHONPATH="$PYTHONPATH:bin:helpers/database-import-scripts/uniprot" pytest
pytest
@@ -20,7 +20,10 @@ def main(rfam_file, output):
if rfam_lengths[name] == model_length:
pass
else:
print("ERROR: same name occurs multiple times and lengths do not match", name)
print(
"ERROR: same name occurs multiple times and lengths do not match",
name,
)
else:
rfam_lengths[name] = model_length
name = ""
@@ -30,17 +30,27 @@ def main(rfam_file, output):


def parse_args():
parser = argparse.ArgumentParser(description="Script produces a file with lengths of Rfam covariance models."
"The resulting file is necessary for running the JSON generating"
"script and only needs to be produced once unless the Rfam.cm"
"file is updated.")
parser.add_argument('-r', '--rfam-file', required=True,
help='Path to the Rfam.cm file containing concatenated Rfam models.')
parser.add_argument('-o', '--output', required=True,
help='Path to the output file where the lengths of models will be saved to.')
parser = argparse.ArgumentParser(
description="Script produces a file with lengths of Rfam covariance models."
"The resulting file is necessary for running the JSON generating"
"script and only needs to be produced once unless the Rfam.cm"
"file is updated."
)
parser.add_argument(
"-r",
"--rfam-file",
required=True,
help="Path to the Rfam.cm file containing concatenated Rfam models.",
)
parser.add_argument(
"-o",
"--output",
required=True,
help="Path to the output file where the lengths of models will be saved to.",
)
return parser.parse_args()


if __name__ == '__main__':
if __name__ == "__main__":
args = parse_args()
main(args.rfam_file, args.output)
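
Note: the hunk above only shows the duplicate-name check; the code that actually extracts each model length is outside the diff. A minimal sketch of that kind of parsing, assuming the concatenated Rfam.cm text carries NAME and CLEN records (field names taken from the INFERNAL CM format, not from this commit):

def read_model_lengths(rfam_path):
    # Sketch: map each covariance model name to its CLEN value.
    lengths = {}
    name = ""
    with open(rfam_path) as handle:
        for line in handle:
            if line.startswith("NAME"):
                name = line.split()[1]
            elif line.startswith("CLEN") and name:
                model_length = line.split()[1]
                if name in lengths and lengths[name] != model_length:
                    # Mirrors the error branch shown in the diff above.
                    print("ERROR: same name occurs multiple times and lengths do not match", name)
                else:
                    lengths[name] = model_length
                name = ""
    return lengths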
Empty file.
10 changes: 5 additions & 5 deletions helpers/file_organiser.sh
@@ -45,8 +45,8 @@ function GenerateRNACentralJSON {

echo "Running JSON generation"
mitload miniconda && conda activate pybase
python3 /nfs/production/rdf/metagenomics/pipelines/prod/genomes-pipeline/helpers/database-import-scripts/rnacentral/generate_rnacentral_json.py \
-r /nfs/production/rdf/metagenomics/pipelines/prod/genomes-pipeline/helpers/database-import-scripts/rnacentral/rfam_model_lengths_14.9.txt \
python3 /nfs/production/rdf/metagenomics/pipelines/prod/genomes-pipeline/helpers/database_import_scripts/rnacentral/generate_rnacentral_json.py \
-r /nfs/production/rdf/metagenomics/pipelines/prod/genomes-pipeline/helpers/database_import_scripts/rnacentral/rfam_model_lengths_14.9.txt \
-m ${RESULTS_PATH}/genomes-all_metadata.tsv -o ${RESULTS_PATH}/additional_data/rnacentral/${CATALOGUE_FOLDER}-rnacentral.json \
-d ${RESULTS_PATH}/additional_data/ncrna_deoverlapped_species_reps/ -g ${RESULTS_PATH}/additional_data/rnacentral/GFFs/ \
-f ${RESULTS_PATH}/additional_data/mgyg_genomes/
@@ -121,19 +121,19 @@ function GenerateUniprotFiles {
fi

mitload miniconda && conda activate pybase
python3 /nfs/production/rdf/metagenomics/pipelines/prod/genomes-pipeline/helpers/database-import-scripts/uniprot/preprocess_taxonomy_for_uniprot.py \
python3 /nfs/production/rdf/metagenomics/pipelines/prod/genomes-pipeline/helpers/database_import_scripts/uniprot/preprocess_taxonomy_for_uniprot.py \
-g ${RESULTS_PATH}/additional_data/gtdb-tk_output/ -r "r214" -v "2" -o ${RESULTS_PATH}/additional_data/uniprot/preprocessed_taxonomy.tsv

echo "Generating Uniprot files"
ACCS=$(ls ${RESULTS_PATH}/additional_data/prokka_gbk_species_reps/${F}.gbk | rev | cut -d '/' -f1 | rev | sed "s/\.gbk//")

for F in $ACCS; do python3 /nfs/production/rdf/metagenomics/pipelines/prod/genomes-pipeline/helpers/database-import-scripts/uniprot/convert_gbk.py \
for F in $ACCS; do python3 /nfs/production/rdf/metagenomics/pipelines/prod/genomes-pipeline/helpers/database_import_scripts/uniprot/convert_gbk.py \
-g ${RESULTS_PATH}/additional_data/prokka_gbk_species_reps/${F}.gbk \
-o ${RESULTS_PATH}/additional_data/uniprot/uniprot-files/${F}_uniprot.gbk \
-t ${RESULTS_PATH}/additional_data/uniprot/preprocessed_taxonomy.tsv; done

echo "Generating Uniprot metadata"
python3 /nfs/production/rdf/metagenomics/pipelines/prod/genomes-pipeline/helpers/database-import-scripts/uniprot/generate_uniprot_metadata.py \
python3 /nfs/production/rdf/metagenomics/pipelines/prod/genomes-pipeline/helpers/database_import_scripts/uniprot/generate_uniprot_metadata.py \
-m ${RESULTS_PATH}/genomes-all_metadata.tsv -o ${RESULTS_PATH}/additional_data/uniprot/${CATALOGUE_FOLDER}_${CATALOGUE_VERSION}_uniprot_metadata.tsv \
-p ${RESULTS_PATH}/additional_data/uniprot/preprocessed_taxonomy.tsv

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,5 +1,5 @@
[tool.pytest.ini_options]
pythonpath = [".", "tests"]
pythonpath = [".", "bin", "helpers/database_import_scripts/uniprot", "tests"]
python_files = ["test_*.py"]

[tool.ruff]
@@ -10,4 +10,4 @@ profile = "black"

[tool.black]
line-length = 110
experimental-string-processing = true
experimental-string-processing = true
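
Note: with bin and helpers/database_import_scripts/uniprot on pythonpath alongside the existing "." entry, the helper code can be reached either as a package through the repository root or as a top-level module through the extra entries. A short sketch of both forms; the package path matches the test diff below, while the top-level example is hypothetical:

# Package-style import, resolved through the "." entry; this is the form the
# rewritten test below uses.
from helpers.database_import_scripts.uniprot.preprocess_taxonomy_for_uniprot import parse_metadata

# The "bin" entry additionally makes any script stored directly in bin/ importable
# as a top-level module, e.g. (hypothetical module name):
#     import some_bin_script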
87 changes: 45 additions & 42 deletions tests/scripts/test_preprocess_taxonomy_for_uniprot.py
@@ -1,58 +1,61 @@
import importlib
import unittest
import os

from pathlib import Path

script_dir = Path(__file__).resolve().parent
uniprot_dir = script_dir / '../../helpers/database-import-scripts/uniprot'
uniprot_dir = uniprot_dir.resolve()

preprocess_taxonomy = importlib.import_module("helpers.database-import-scripts.uniprot.preprocess_taxonomy_for_uniprot")
parse_metadata =preprocess_taxonomy.parse_metadata
match_taxid_to_gca = preprocess_taxonomy.match_taxid_to_gca
get_species_level_taxonomy = preprocess_taxonomy.get_species_level_taxonomy
from helpers.database_import_scripts.uniprot.preprocess_taxonomy_for_uniprot import (
get_species_level_taxonomy,
match_taxid_to_gca,
parse_metadata,
)


class TestPreprocessTaxonomyForUniprot(unittest.TestCase):
def setUp(self):
self.metadata_file = os.path.join(script_dir, "fixtures", "preprocess_taxonomy_for_uniprot",
"sample_metadata.tsv")
self.metadata_file = (
Path(__file__).resolve().parent
/ "fixtures/preprocess_taxonomy_for_uniprot/sample_metadata.tsv"
)
self.gca_accessions, self.sample_accessions = parse_metadata(self.metadata_file)
self.mgyg_to_gca, self.gca_to_taxid = match_taxid_to_gca(self.gca_accessions, self.sample_accessions, 2)
self.mgyg_to_gca, self.gca_to_taxid = match_taxid_to_gca(
self.gca_accessions, self.sample_accessions, 2
)

def test_match_taxid_to_gca(self):
expected_gca_to_taxid = {'GCA_946182765': '370804',
'GCA_946182785': '244328',
'GCA_946182825': '297314',
'GCA_946182815': '370804',
'GCA_946182805': '190764',
'GCA_946182835': '370804',
'GCA_946183145': '1678694',
'GCA_946183335': '157472'}
expected_mgyg_to_gca = {'MGYG000304400': 'GCA_946182765',
'MGYG000304401': 'GCA_946182785',
'MGYG000304403': 'GCA_946182825',
'MGYG000304404': 'GCA_946182815',
'MGYG000304405': 'GCA_946182805',
'MGYG000304406': 'GCA_946182835',
'MGYG000304407': 'GCA_946183145',
'MGYG000304408': 'GCA_946183335'}
expected_gca_to_taxid = {
"GCA_946182765": "370804",
"GCA_946182785": "244328",
"GCA_946182825": "297314",
"GCA_946182815": "370804",
"GCA_946182805": "190764",
"GCA_946182835": "370804",
"GCA_946183145": "1678694",
"GCA_946183335": "157472",
}
expected_mgyg_to_gca = {
"MGYG000304400": "GCA_946182765",
"MGYG000304401": "GCA_946182785",
"MGYG000304403": "GCA_946182825",
"MGYG000304404": "GCA_946182815",
"MGYG000304405": "GCA_946182805",
"MGYG000304406": "GCA_946182835",
"MGYG000304407": "GCA_946183145",
"MGYG000304408": "GCA_946183335",
}
self.assertEqual(self.mgyg_to_gca, expected_mgyg_to_gca)
self.assertEqual(self.gca_to_taxid, expected_gca_to_taxid)

def test_get_species_level_taxonomy(self):
taxid1, name1, submittable1, lineage1 = get_species_level_taxonomy(
"k__Bacteria;p__Bacillota_I;c__Bacilli_A;o__RFN20;f__CAG-826;g__UBA3207;s__UBA3207 sp946183335")
self.assertEqual(lineage1, "d__Bacteria;p__Bacillota_I;c__Bacilli_A;o__RFN20;f__CAG-826;g__UBA3207;s__UBA3207 sp946183335")
_, name1, submittable1, lineage1 = get_species_level_taxonomy(
"k__Bacteria;p__Bacillota_I;c__Bacilli_A;o__RFN20;f__CAG-826;g__UBA3207;s__UBA3207 sp946183335"
)
self.assertEqual(
lineage1,
"d__Bacteria;p__Bacillota_I;c__Bacilli_A;o__RFN20;f__CAG-826;g__UBA3207;s__UBA3207 sp946183335",
)
self.assertEqual(submittable1, False)
self.assertEqual(name1, "UBA3207 sp946183335")

taxid2, name2, submittable2, lineage2 = get_species_level_taxonomy(
"d__Bacteria; p__Pseudomonadota; c__Gammaproteobacteria; o__Enterobacterales; f__Enterobacteriaceae; g__Escherichia; s__Escherichia coli")

taxid2, _, submittable2, _ = get_species_level_taxonomy(
"d__Bacteria; p__Pseudomonadota; c__Gammaproteobacteria; o__Enterobacterales; f__Enterobacteriaceae; g__Escherichia; s__Escherichia coli"
)
self.assertEqual(submittable2, True)
self.assertEqual(taxid2, '562')




self.assertEqual(taxid2, "562")
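
Note: the switch from importlib.import_module to a plain import in this test goes hand in hand with renaming database-import-scripts to database_import_scripts throughout the commit: an import statement cannot name a package whose directory contains a hyphen. A small sketch of the difference, using the paths from this repository:

# Old layout (hyphenated directories): a plain import is a SyntaxError,
#     import helpers.database-import-scripts.uniprot.preprocess_taxonomy_for_uniprot
# so the previous test reached the module through importlib, which accepts the
# dotted name as a string:
#     importlib.import_module("helpers.database-import-scripts.uniprot.preprocess_taxonomy_for_uniprot")

# New layout (underscored directories): every path component is a valid Python
# identifier, so the ordinary statement works and static tooling can resolve it.
from helpers.database_import_scripts.uniprot import preprocess_taxonomy_for_uniprot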
