From 5fcf179cf366ab82b31540d6a40124dc53b86adc Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Thu, 25 Jan 2024 13:42:08 +0000 Subject: [PATCH 01/26] Project restructure WIP --- .github/workflows/test.yml | 24 + .gitignore | 81 +++ .pre-commit-config.yaml | 20 + LICENSE.md | 2 +- genomeuploader/__init__.py | 1 + genomeuploader/constants.py | 637 ++++++++++++++++++ genomeuploader/ena.py | 310 +++++++++ .../genome_upload.py | 604 +++-------------- pyproject.toml | 86 +++ pytest.ini | 3 + tests/test_dummy.py | 5 + 11 files changed, 1274 insertions(+), 499 deletions(-) create mode 100644 .github/workflows/test.yml create mode 100644 .pre-commit-config.yaml create mode 100644 genomeuploader/__init__.py create mode 100644 genomeuploader/constants.py create mode 100644 genomeuploader/ena.py rename genome_upload.py => genomeuploader/genome_upload.py (53%) create mode 100644 pyproject.toml create mode 100644 pytest.ini create mode 100644 tests/test_dummy.py diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..05542ac --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,24 @@ +name: Testing + +on: [push, pull_request] + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [3.8, 3.9, "3.10"] + + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Dependencies + run: | + pip install .[test] + - name: 🧪 - Testing + run: | + pytest -v diff --git a/.gitignore b/.gitignore index ca2e7f3..8fe6e30 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,82 @@ cluster_uploader_wrapper.py + +#IntelliJ project structure files +*.iml +*.xml +.idea/ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ 
+parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Rope project settings +.ropeproject + +# VSCode +.vscode/ + +# Ruff +.ruff_cache/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..becc121 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,20 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + - id: end-of-file-fixer + - id: trailing-whitespace + - repo: https://github.com/psf/black + rev: 23.7.0 + hooks: + - id: black + - repo: https://github.com/pycqa/isort + rev: 5.12.0 + hooks: + - id: isort + name: isort (python) + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.0.286 + hooks: + - id: ruff + args: [--fix, --exit-non-zero-on-fix, --show-fixes] diff --git a/LICENSE.md b/LICENSE.md index 73fe4f6..a3d61d5 100755 --- a/LICENSE.md +++ b/LICENSE.md @@ -186,7 +186,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright 2017-2022 EMBL-EBI + Copyright 2017-2024 EMBL-EBI Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/genomeuploader/__init__.py b/genomeuploader/__init__.py new file mode 100644 index 0000000..aced243 --- /dev/null +++ b/genomeuploader/__init__.py @@ -0,0 +1 @@ +__version__ = "0.0.1" # TODO: pin the correct version \ No newline at end of file diff --git a/genomeuploader/constants.py b/genomeuploader/constants.py new file mode 100644 index 0000000..11d9a5e --- /dev/null +++ b/genomeuploader/constants.py @@ -0,0 +1,637 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright 2017-2024 EMBL - European Bioinformatics Institute +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +HQ = ( + "Multiple fragments where gaps span repetitive regions. Presence of the " + "23S, 16S, and 5S rRNA genes and at least 18 tRNAs." +) +MQ = ( + "Many fragments with little to no review of assembly other than reporting " + "of standard assembly statistics." 
+) + +METAGENOMES = [ + "activated carbon metagenome", + "activated sludge metagenome", + "aerosol metagenome", + "air metagenome", + "algae metagenome", + "alkali sediment metagenome", + "amphibian metagenome", + "anaerobic digester metagenome", + "anchialine metagenome", + "annelid metagenome", + "ant fungus garden metagenome", + "ant metagenome", + "aquaculture metagenome", + "aquatic eukaryotic metagenome", + "aquatic metagenome", + "aquatic viral metagenome", + "aquifer metagenome", + "ballast water metagenome", + "bat gut metagenome", + "bat metagenome", + "beach sand metagenome", + "beetle metagenome", + "bentonite metagenome", + "bioanode metagenome", + "biocathode metagenome", + "biofilm metagenome", + "biofilter metagenome", + "biofloc metagenome", + "biogas fermenter metagenome", + "bioleaching metagenome", + "bioreactor metagenome", + "bioreactor sludge metagenome", + "bioretention column metagenome", + "biosolids metagenome", + "bird metagenome", + "blood metagenome", + "bog metagenome", + "book metagenome", + "bovine gut metagenome", + "bovine metagenome", + "brine metagenome", + "canine metagenome", + "cave metagenome", + "cetacean metagenome", + "chemical production metagenome", + "chicken gut metagenome", + "ciliate metagenome", + "clay metagenome", + "clinical metagenome", + "cloud metagenome", + "coal metagenome", + "cold seep metagenome", + "cold spring metagenome", + "compost metagenome", + "concrete metagenome", + "coral metagenome", + "coral reef metagenome", + "cow dung metagenome", + "crab metagenome", + "crude oil metagenome", + "Crustacea gut metagenome", + "crustacean metagenome", + "ctenophore metagenome", + "decomposition metagenome", + "desalination cell metagenome", + "dietary supplements metagenome", + "dinoflagellate metagenome", + "drinking water metagenome", + "dust metagenome", + "ear metagenome", + "echinoderm metagenome", + "egg metagenome", + "electrolysis cell metagenome", + "endophyte metagenome", + "epibiont metagenome", + 
"estuary metagenome", + "eukaryotic metagenome", + "eukaryotic plankton metagenome", + "eye metagenome", + "factory metagenome", + "feces metagenome", + "feline metagenome", + "fermentation metagenome", + "fertilizer metagenome", + "fish gut metagenome", + "fishing equipment metagenome", + "fish metagenome", + "floral nectar metagenome", + "flotsam metagenome", + "flower metagenome", + "food contamination metagenome", + "food fermentation metagenome", + "food metagenome", + "food production metagenome", + "fossil metagenome", + "freshwater metagenome", + "freshwater sediment metagenome", + "frog metagenome", + "fuel tank metagenome", + "fungus metagenome", + "gas well metagenome", + "gill metagenome", + "glacier lake metagenome", + "glacier metagenome", + "gonad metagenome", + "grain metagenome", + "granuloma metagenome", + "groundwater metagenome", + "gut metagenome", + "halite metagenome", + "herbal medicine metagenome", + "honeybee metagenome", + "honey metagenome", + "horse metagenome", + "hospital metagenome", + "hot springs metagenome", + "human bile metagenome", + "human blood metagenome", + "human brain metagenome", + "human eye metagenome", + "human feces metagenome", + "human gut metagenome", + "human hair metagenome", + "human lung metagenome", + "human metagenome", + "human milk metagenome", + "human nasopharyngeal metagenome", + "human oral metagenome", + "human reproductive system metagenome", + "human saliva metagenome", + "human semen metagenome", + "human skeleton metagenome", + "human skin metagenome", + "human sputum metagenome", + "human tracheal metagenome", + "human urinary tract metagenome", + "human vaginal metagenome", + "human viral metagenome", + "HVAC metagenome", + "hydrocarbon metagenome", + "hydrothermal vent metagenome", + "hydrozoan metagenome", + "hypersaline lake metagenome", + "hyphosphere metagenome", + "hypolithon metagenome", + "ice metagenome", + "indoor metagenome", + "industrial waste metagenome", + "insect gut metagenome", 
+ "insect metagenome", + "insect nest metagenome", + "internal organ metagenome", + "interstitial water metagenome", + "invertebrate gut metagenome", + "invertebrate metagenome", + "jellyfish metagenome", + "karst metagenome", + "koala metagenome", + "lagoon metagenome", + "lake water metagenome", + "landfill metagenome", + "leaf litter metagenome", + "leaf metagenome", + "lichen crust metagenome", + "lichen metagenome", + "liver metagenome", + "lung metagenome", + "macroalgae metagenome", + "mangrove metagenome", + "manure metagenome", + "marine metagenome", + "marine plankton metagenome", + "marine sediment metagenome", + "marsh metagenome", + "marsupial metagenome", + "medical device metagenome", + "metagenome", + "microbial eukaryotic metagenome", + "microbial fuel cell metagenome", + "microbial mat metagenome", + "microeukaryotic metagenome", + "milk metagenome", + "mine drainage metagenome", + "mine metagenome", + "mine tailings metagenome", + "mite metagenome", + "mixed culture metagenome", + "mollusc metagenome", + "money metagenome", + "moonmilk metagenome", + "mosquito metagenome", + "moss metagenome", + "mouse gut metagenome", + "mouse metagenome", + "mouse skin metagenome", + "mud metagenome", + "museum specimen metagenome", + "musk metagenome", + "nematode metagenome", + "neuston metagenome", + "nutrient bag metagenome", + "oasis metagenome", + "oil field metagenome", + "oil metagenome", + "oil production facility metagenome", + "oil sands metagenome", + "oral metagenome", + "oral-nasopharyngeal metagenome", + "oral viral metagenome", + "outdoor metagenome", + "ovine metagenome", + "oyster metagenome", + "painting metagenome", + "paper pulp metagenome", + "parasite metagenome", + "parchment metagenome", + "peat metagenome", + "periphyton metagenome", + "permafrost metagenome", + "photosynthetic picoeukaryotic metagenome", + "phycosphere metagenome", + "phyllosphere metagenome", + "phytotelma metagenome", + "pig gut metagenome", + "pig metagenome", + 
"pipeline metagenome", + "pitcher plant inquiline metagenome", + "placenta metagenome", + "plant metagenome", + "plastic metagenome", + "plastisphere metagenome", + "pollen metagenome", + "pond metagenome", + "poultry litter metagenome", + "power plant metagenome", + "primate metagenome", + "probiotic metagenome", + "protist metagenome", + "psyllid metagenome", + "rat gut metagenome", + "rat metagenome", + "reproductive system metagenome", + "respiratory tract metagenome", + "retting metagenome", + "rhizoplane metagenome", + "rhizosphere metagenome", + "rice paddy metagenome", + "riverine metagenome", + "rock metagenome", + "rock porewater metagenome", + "rodent metagenome", + "root associated fungus metagenome", + "root metagenome", + "runoff metagenome", + "saline spring metagenome", + "saltern metagenome", + "salt lake metagenome", + "salt marsh metagenome", + "salt mine metagenome", + "salt pan metagenome", + "sand metagenome", + "scorpion gut metagenome", + "sea anemone metagenome", + "seagrass metagenome", + "sea squirt metagenome", + "sea urchin metagenome", + "seawater metagenome", + "sediment metagenome", + "seed metagenome", + "semen metagenome", + "shale gas metagenome", + "sheep gut metagenome", + "sheep metagenome", + "shoot metagenome", + "shrew metagenome", + "shrimp gut metagenome", + "silage metagenome", + "skin metagenome", + "slag metagenome", + "sludge metagenome", + "snake metagenome", + "snow metagenome", + "soda lake metagenome", + "soda lime metagenome", + "soil crust metagenome", + "soil metagenome", + "solid waste metagenome", + "spider metagenome", + "sponge metagenome", + "starfish metagenome", + "steel metagenome", + "stomach metagenome", + "stromatolite metagenome", + "subsurface metagenome", + "surface metagenome", + "symbiont metagenome", + "synthetic metagenome", + "tannin metagenome", + "tar pit metagenome", + "termitarium metagenome", + "termite fungus garden metagenome", + "termite gut metagenome", + "termite metagenome", + 
"terrestrial metagenome", + "tick metagenome", + "tidal flat metagenome", + "tin mine metagenome", + "tobacco metagenome", + "tomb wall metagenome", + "tree metagenome", + "upper respiratory tract metagenome", + "urban metagenome", + "urinary tract metagenome", + "urine metagenome", + "urogenital metagenome", + "vaginal metagenome", + "viral metagenome", + "volcano metagenome", + "wallaby gut metagenome", + "wasp metagenome", + "wastewater metagenome", + "wetland metagenome", + "whale fall metagenome", + "whole organism metagenome", + "wine metagenome", + "Winogradsky column metagenome", + "wood decay metagenome", + "zebrafish metagenome", +] + +GEOGRAPHIC_LOCATIONS = [ + "Afghanistan", + "Albania", + "Algeria", + "American Samoa", + "Andorra", + "Angola", + "Anguilla", + "Antarctica", + "Antigua and Barbuda", + "Arctic Ocean", + "Argentina", + "Armenia", + "Aruba", + "Ashmore and Cartier Islands", + "Atlantic Ocean", + "Australia", + "Austria", + "Azerbaijan", + "Bahamas", + "Bahrain", + "Baker Island", + "Baltic Sea", + "Bangladesh", + "Barbados", + "Bassas da India", + "Belarus", + "Belgium", + "Belize", + "Benin", + "Bermuda", + "Bhutan", + "Bolivia", + "Borneo", + "Bosnia and Herzegovina", + "Botswana", + "Bouvet Island", + "Brazil", + "British Virgin Islands", + "Brunei", + "Bulgaria", + "Burkina Faso", + "Burundi", + "Cambodia", + "Cameroon", + "Canada", + "Cape Verde", + "Cayman Islands", + "Central African Republic", + "Chad", + "Chile", + "China", + "Christmas Island", + "Clipperton Island", + "Cocos Islands", + "Colombia", + "Comoros", + "Cook Islands", + "Coral Sea Islands", + "Costa Rica", + "Cote d'Ivoire", + "Croatia", + "Cuba", + "Curacao", + "Cyprus", + "Czech Republic", + "Democratic Republic of the Congo", + "Denmark", + "Djibouti", + "Dominica", + "Dominican Republic", + "East Timor", + "Ecuador", + "Egypt", + "El Salvador", + "Equatorial Guinea", + "Eritrea", + "Estonia", + "Ethiopia", + "Europa Island", + "Falkland Islands (Islas Malvinas)", + 
"Faroe Islands", + "Fiji", + "Finland", + "France", + "French Guiana", + "French Polynesia", + "French Southern and Antarctic Lands", + "Gabon", + "Gambia", + "Gaza Strip", + "Georgia", + "Germany", + "Ghana", + "Gibraltar", + "Glorioso Islands", + "Greece", + "Greenland", + "GrENAda", + "Guadeloupe", + "Guam", + "Guatemala", + "Guernsey", + "Guinea", + "Guinea-Bissau", + "Guyana", + "Haiti", + "Heard Island and McDonald Islands", + "Honduras", + "Hong Kong", + "Howland Island", + "Hungary", + "Iceland", + "India", + "Indian Ocean", + "Indonesia", + "Iran", + "Iraq", + "Ireland", + "Isle of Man", + "Israel", + "Italy", + "Jamaica", + "Jan Mayen", + "Japan", + "Jarvis Island", + "Jersey", + "Johnston Atoll", + "Jordan", + "Juan de Nova Island", + "Kazakhstan", + "Kenya", + "Kerguelen Archipelago", + "Kingman Reef", + "Kiribati", + "Kosovo", + "Kuwait", + "Kyrgyzstan", + "Laos", + "Latvia", + "Lebanon", + "Lesotho", + "Liberia", + "Libya", + "Liechtenstein", + "Lithuania", + "Luxembourg", + "Macau", + "Macedonia", + "Madagascar", + "Malawi", + "Malaysia", + "Maldives", + "Mali", + "Malta", + "Marshall Islands", + "Martinique", + "Mauritania", + "Mauritius", + "Mayotte", + "Mediterranean Sea", + "Mexico", + "Micronesia", + "Midway Islands", + "Moldova", + "Monaco", + "Mongolia", + "Montenegro", + "Montserrat", + "Morocco", + "Mozambique", + "Myanmar", + "Namibia", + "Nauru", + "Navassa Island", + "Nepal", + "Netherlands", + "New Caledonia", + "New Zealand", + "Nicaragua", + "Niger", + "Nigeria", + "Niue", + "Norfolk Island", + "Northern Mariana Islands", + "North Korea", + "North Sea", + "Norway", + "not applicable", + "not collected", + "not provided", + "Oman", + "Pacific Ocean", + "Pakistan", + "Palau", + "Palmyra Atoll", + "Panama", + "Papua New Guinea", + "Paracel Islands", + "Paraguay", + "Peru", + "Philippines", + "Pitcairn Islands", + "Poland", + "Portugal", + "Puerto Rico", + "Qatar", + "Republic of the Congo", + "restricted access", + "Reunion", + "Romania", 
+ "Ross Sea", + "Russia", + "Rwanda", + "Saint HelENA", + "Saint Kitts and Nevis", + "Saint Lucia", + "Saint Pierre and Miquelon", + "Saint Vincent and the GrENAdines", + "Samoa", + "San Marino", + "Sao Tome and Principe", + "Saudi Arabia", + "Senegal", + "Serbia", + "Seychelles", + "Sierra Leone", + "Singapore", + "Sint Maarten", + "Slovakia", + "Slovenia", + "Solomon Islands", + "Somalia", + "South Africa", + "Southern Ocean", + "South Georgia and the South Sandwich Islands", + "South Korea", + "Spain", + "Spratly Islands", + "Sri Lanka", + "Sudan", + "Suriname", + "Svalbard", + "Swaziland", + "Sweden", + "Switzerland", + "Syria", + "Taiwan", + "Tajikistan", + "Tanzania", + "Tasman Sea", + "Thailand", + "Togo", + "Tokelau", + "Tonga", + "Trinidad and Tobago", + "Tromelin Island", + "Tunisia", + "Turkey", + "Turkmenistan", + "Turks and Caicos Islands", + "Tuvalu", + "Uganda", + "Ukraine", + "United Arab Emirates", + "United Kingdom", + "Uruguay", + "USA", + "Uzbekistan", + "Vanuatu", + "Venezuela", + "Viet Nam", + "Virgin Islands", + "Wake Island", + "Wallis and Futuna", + "West Bank", + "Western Sahara", + "Yemen", + "Zambia", + "Zimbabwe", +] diff --git a/genomeuploader/ena.py b/genomeuploader/ena.py new file mode 100644 index 0000000..3542351 --- /dev/null +++ b/genomeuploader/ena.py @@ -0,0 +1,310 @@ + +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright 2017-2024 EMBL - European Bioinformatics Institute +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import json
import logging
import xml.dom.minidom as minidom
from time import sleep

import requests

# NOTE(review): configuring the root logger at import time is kept from the
# original module so existing behaviour does not change; a library module
# would normally leave this to the application entry point.
logging.basicConfig(level=logging.DEBUG)

logger = logging.getLogger(__name__)


# Comma-separated field lists sent as the ``fields`` parameter of a search.
RUN_DEFAULT_FIELDS = ','.join([
    'study_accession',
    'secondary_study_accession',
    'instrument_model',
    'run_accession',
    'sample_accession',
])

ASSEMBLY_DEFAULT_FIELDS = 'sample_accession'

SAMPLE_DEFAULT_FIELDS = ','.join([
    'sample_accession',
    'secondary_sample_accession',
    'collection_date',
    'country',
    'location',
])

STUDY_DEFAULT_FIELDS = ','.join([
    'study_accession',
    'secondary_study_accession',
    'description',
    'study_title',
])

# Maximum number of requests made for one lookup while the API keeps
# answering "204 No Content".
RETRY_COUNT = 5


class ENA():
    """Small client for the ENA search, browser, taxonomy and drop-box endpoints."""

    def get_default_params(self):
        """Return a fresh dict with the parameters shared by every search request."""
        return {
            'format': 'json',
            'includeMetagenomes': True,
            'dataPortal': 'ena',
        }

    def post_request(self, data, webin, password):
        """POST a form-encoded search to the ENA portal API.

        Args:
            data: dict of search parameters (result, fields, query, ...).
            webin: user name sent as the first element of the ``auth`` tuple.
            password: password sent as the second element of the ``auth`` tuple.

        Returns:
            The ``requests`` response object, unchecked.
        """
        url = "https://www.ebi.ac.uk/ena/portal/api/search"
        auth = (webin, password)
        default_connection_headers = {
            "Content-Type": "application/x-www-form-urlencoded",
            "Accept": "*/*",
        }
        return requests.post(url, data=data, auth=auth, headers=default_connection_headers)

    def get_run(self, run_accession, webin, password, attempt=0, search_params=None):
        """Fetch the ``read_run`` record for ``run_accession``.

        A 204 (no content) answer is retried after a one second pause, up to
        ``RETRY_COUNT`` requests in total; ``search_params`` is kept across
        retries.

        Args:
            run_accession: accession used in the ``run_accession`` query.
            webin: user name for the request.
            password: password for the request.
            attempt: zero-based index of the current attempt (used by the retry).
            search_params: optional dict overriding/adding search parameters.

        Returns:
            The first element of the JSON list returned by the API.

        Raises:
            ValueError: the request failed, the run was not found, or the
                response body could not be interpreted.
        """
        data = self.get_default_params()
        data['result'] = 'read_run'
        data['fields'] = RUN_DEFAULT_FIELDS
        data['query'] = 'run_accession="{}"'.format(run_accession)

        if search_params:
            data.update(search_params)

        response = self.post_request(data, webin, password)

        # Fail fast with the server message instead of trying to parse an
        # error payload as if it were a result.
        if not response.ok:
            raise ValueError("Could not retrieve run with accession {}, returned "
                             "message: {}".format(run_accession, response.text))

        if response.status_code == 204:
            if attempt + 1 < RETRY_COUNT:
                sleep(1)
                return self.get_run(run_accession, webin, password,
                                    attempt=attempt + 1, search_params=search_params)
            raise ValueError("Could not find run {} in ENA after {}"
                             " attempts".format(run_accession, RETRY_COUNT))

        try:
            return json.loads(response.text)[0]
        except (IndexError, KeyError, TypeError, ValueError) as err:
            raise ValueError("Could not find run {} in ENA.".format(run_accession)) from err

    def get_run_from_assembly(self, assembly_name):
        """Return the accession of the first ``RUN_REF`` element in the browser XML.

        The XML is downloaded from ``/ena/browser/api/xml/<assembly_name>``.
        The HTTP status is not checked, and an ``IndexError`` is raised when
        the document contains no ``RUN_REF`` element.
        """
        url = "https://www.ebi.ac.uk" + "/ena/browser/api/xml/" + assembly_name
        manifest_xml = minidom.parseString(requests.get(url).text)

        run_refs = manifest_xml.getElementsByTagName("RUN_REF")
        return run_refs[0].attributes["accession"].value

    def get_study(self, webin, password, primary_accession=None, secondary_accession=None):
        """Look a study up by primary and/or secondary accession.

        Every combination of result type (``study``, ``read_study``,
        ``analysis_study``) and data portal (``ena``, ``metagenome``) is
        queried in turn, and the first record found is returned. For the
        ``study`` result type the field ``description`` is requested as
        ``study_description`` and renamed back in the returned record.

        Returns:
            dict: the first study record found.

        Raises:
            ValueError: none of the queries returned a usable record.
        """
        data = self.get_default_params()
        data['result'] = 'read_study'
        data['fields'] = STUDY_DEFAULT_FIELDS

        if primary_accession and not secondary_accession:
            data['query'] = 'study_accession="{}"'.format(primary_accession)
        elif not primary_accession and secondary_accession:
            data['query'] = 'secondary_study_accession="{}"'.format(secondary_accession)
        else:
            data['query'] = 'study_accession="{}" AND secondary_study_accession="{}"' \
                .format(primary_accession, secondary_accession)

        query_params = []
        for result_type in ('study', 'read_study', 'analysis_study'):
            for data_portal in ('ena', 'metagenome'):
                param = dict(data, result=result_type, dataPortal=data_portal)
                if result_type == 'study':
                    param['fields'] = param['fields'].replace('description', 'study_description')
                query_params.append(param)

        for param in query_params:
            # Send the per-iteration parameters; sending `data` here would
            # repeat the same read_study/ena query six times.
            response = self.post_request(param, webin, password)

            if response.status_code == 204:
                print("No info found to fetch study with params {}".format(param))
                continue

            try:
                study = json.loads(response.text)[0]
            except (IndexError, TypeError, ValueError, KeyError):
                print("Failed to fetch study with params {}, returned error: {}".format(param, response.text))
                continue

            if param['result'] == 'study' and 'study_description' in study:
                study['description'] = study.pop('study_description')
            return study

        raise ValueError('Could not find study {} {} in ENA.'.format(primary_accession, secondary_accession))

    def get_study_runs(self, study_acc, webin, password, fields=None, search_params=None):
        """Return the list of runs whose primary or secondary study accession is ``study_acc``.

        Args:
            study_acc: accession matched against both study accession fields.
            webin: user name for the request.
            password: password for the request.
            fields: optional comma-separated field list (defaults to
                ``RUN_DEFAULT_FIELDS``).
            search_params: optional dict overriding/adding search parameters.

        Returns:
            list: parsed JSON response; an empty list on a 204 answer.

        Raises:
            ValueError: the request failed or the body is not valid JSON.
        """
        data = self.get_default_params()
        data['result'] = 'read_run'
        data['fields'] = fields or RUN_DEFAULT_FIELDS
        data['query'] = '(study_accession="{}" OR secondary_study_accession="{}")'.format(study_acc, study_acc)

        if search_params:
            data.update(search_params)

        response = self.post_request(data, webin, password)

        if not response.ok:
            raise ValueError("Could not retrieve runs for study {}.".format(study_acc))

        if response.status_code == 204:
            return []

        try:
            return json.loads(response.text)
        except ValueError as err:
            raise ValueError("Query against ENA API did not work. Returned "
                             "message: {}".format(response.text)) from err

    def get_sample(self, sample_accession, webin, password, fields=None, search_params=None, attempt=0):
        """Fetch the sample record(s) matching ``sample_accession``.

        The accession is matched against both the primary and the secondary
        sample accession. On a 204 answer the lookup is repeated against the
        other data portal (``ena`` <-> ``metagenome``), keeping any other
        ``search_params``, for at most ``RETRY_COUNT`` requests in total.

        Returns:
            The decoded JSON body of the 200 response.

        Raises:
            ValueError: the sample was not found or the request failed.
        """
        data = self.get_default_params()
        data['result'] = 'sample'
        data['fields'] = fields or SAMPLE_DEFAULT_FIELDS
        data['query'] = ('(sample_accession="{acc}" OR secondary_sample_accession'
                         '="{acc}") ').format(acc=sample_accession)

        if search_params:
            data.update(search_params)

        response = self.post_request(data, webin, password)

        if response.status_code == 200:
            return response.json()

        if response.status_code != 204:
            raise ValueError("Could not retrieve sample with accession {}. "
                             "Returned message: {}".format(sample_accession, response.text))

        if attempt + 1 >= RETRY_COUNT:
            raise ValueError("Could not find sample {} in ENA after "
                             "{} attempts.".format(sample_accession, RETRY_COUNT))

        # Keep the caller's extra parameters and only flip the portal.
        retry_params = dict(search_params or {})
        retry_params['dataPortal'] = 'metagenome' if data['dataPortal'] == 'ena' else 'ena'
        return self.get_sample(sample_accession, webin, password, fields=fields,
                               search_params=retry_params, attempt=attempt + 1)

    def query_taxid(self, taxid):
        """Return the ``scientificName`` for ``taxid`` from the taxonomy endpoint.

        Returns ``False`` (after printing the error) when the HTTP request
        fails, and an empty string when the record has no ``scientificName``.
        """
        url = "https://www.ebi.ac.uk/ena/taxonomy/rest/tax-id/{}".format(taxid)
        response = requests.get(url)

        try:
            # Will raise exception if response status code is non-200
            response.raise_for_status()
        except requests.exceptions.HTTPError as e:
            print("Request failed {} with error {}".format(url, e))
            return False

        return response.json().get("scientificName", "")

    def query_scientific_name(self, scientificName, searchRank=False):
        """Look a scientific name up in the taxonomy endpoint.

        Args:
            scientificName: name appended to the ``scientific-name`` URL.
            searchRank: when true the rank is returned as a third element.

        Returns:
            ``(submittable, taxid)`` or, with ``searchRank``,
            ``(submittable, taxid, rank)``. On an HTTP error or an empty
            result list the tuple is ``False`` followed by empty strings.
        """
        not_found = (False, "", "") if searchRank else (False, "")

        url = "https://www.ebi.ac.uk/ena/taxonomy/rest/scientific-name/{}".format(scientificName)
        response = requests.get(url)

        try:
            # Will raise exception if response status code is non-200
            response.raise_for_status()
        except requests.exceptions.HTTPError:
            return not_found

        try:
            res = response.json()[0]
        except IndexError:
            return not_found

        submittable = res.get("submittable", "").lower() == "true"
        taxid = res.get("taxId", "")
        rank = res.get("rank", "")

        if searchRank:
            return submittable, taxid, rank
        return submittable, taxid

    def handle_genomes_registration(self, sample_xml, submission_xml, webin, password, live=False):
        """Submit the sample and submission XML files and collect the accessions.

        Args:
            sample_xml: path of the SAMPLE XML file.
            submission_xml: path of the SUBMISSION XML file.
            webin: user name for the request.
            password: password for the request.
            live: post to ``www.ebi.ac.uk`` when true, otherwise to
                ``wwwdev.ebi.ac.uk``.

        Returns:
            dict: sample alias -> accession, read from the ``SAMPLE`` elements
            of the receipt.

        Raises:
            Exception: the HTTP status is not 200, or the receipt does not
                report ``success="true"``.
        """
        liveSub, mode = "", "live"

        if not live:
            liveSub = "dev"
            mode = "test"

        url = "https://www{}.ebi.ac.uk/ena/submit/drop-box/submit/".format(liveSub)

        logger.info('Registering sample xml in {} mode.'.format(mode))

        # Context managers guarantee both files are closed even if the POST fails.
        with open(submission_xml, 'r') as submission_fh, open(sample_xml, 'r') as sample_fh:
            files = {
                'SUBMISSION': submission_fh,
                'SAMPLE': sample_fh,
            }
            submissionResponse = requests.post(url, files=files, auth=(webin, password))

        if submissionResponse.status_code != 200:
            if str(submissionResponse.status_code).startswith('5'):
                raise Exception("Genomes could not be submitted to ENA as the server " +
                                "does not respond. Please try again later.")
            raise Exception("Genomes could not be submitted to ENA. HTTP response: " +
                            submissionResponse.reason)

        receiptXml = minidom.parseString((submissionResponse.content).decode("utf-8"))
        receipt = receiptXml.getElementsByTagName("RECEIPT")
        success = receipt[0].attributes["success"].value

        # Anything other than an explicit "true" is treated as a failure so
        # the function never reaches the return with an unbound result.
        if success != "true":
            finalError = "\tSome genomes could not be submitted to ENA. Please, check the errors below."
            for error in receiptXml.getElementsByTagName("ERROR"):
                finalError += "\n\t" + error.firstChild.data
            finalError += "\n\tIf you wish to validate again your data and metadata, "
            finalError += "please use the --force option."
            raise Exception(finalError)

        aliasDict = {}
        for s in receiptXml.getElementsByTagName("SAMPLE"):
            aliasDict[s.attributes["alias"].value] = s.attributes["accession"].value

        logger.info('{} genome samples successfully registered.'.format(str(len(aliasDict))))

        return aliasDict


# ---------------------------------------------------------------------------
# Patch text that shared the last original line with this module, kept
# verbatim (start of the diff for the next file):
#
# \ No newline at end of file
# diff --git a/genome_upload.py b/genomeuploader/genome_upload.py
# similarity index 53%
# rename from genome_upload.py
# rename to genomeuploader/genome_upload.py
# index ee4bae5..ee7055d 100755
# --- a/genome_upload.py
# +++ b/genomeuploader/genome_upload.py
# @@ -1,4 +1,18 @@
#  #!/usr/bin/env python
# +# -*- coding: utf-8 -*-
# +
# +# Copyright 2017-2024 EMBL - European Bioinformatics Institute
# +#
# +# Licensed under the Apache License, Version 2.0 (the "License");
# +# you may not use this file except in compliance with the License.
# +# You may obtain a copy of the License at
# +# http://www.apache.org/licenses/LICENSE-2.0
# +#
# +# Unless required by applicable law or agreed to in writing, software
# +# distributed under the License is distributed on an "AS IS" BASIS,
# +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. import os import sys @@ -8,199 +22,25 @@ import json import pandas as pd from datetime import date, datetime as dt -from time import sleep + import xml.etree.ElementTree as ET import xml.dom.minidom as minidom import requests -sys.path.append(os.path.join(os.path.dirname(__file__), '..')) - -metagenomes = ["activated carbon metagenome", "activated sludge metagenome", - "aerosol metagenome", "air metagenome", "algae metagenome", "alkali sediment metagenome", - "amphibian metagenome", "anaerobic digester metagenome", "anchialine metagenome", - "annelid metagenome", "ant fungus garden metagenome", "ant metagenome", - "aquaculture metagenome", "aquatic eukaryotic metagenome", "aquatic metagenome", - "aquatic viral metagenome", "aquifer metagenome", "ballast water metagenome", - "bat gut metagenome", "bat metagenome", "beach sand metagenome", "beetle metagenome", - "bentonite metagenome", "bioanode metagenome", "biocathode metagenome", - "biofilm metagenome", "biofilter metagenome", "biofloc metagenome", - "biogas fermenter metagenome", "bioleaching metagenome", "bioreactor metagenome", - "bioreactor sludge metagenome", "bioretention column metagenome", "biosolids metagenome", - "bird metagenome", "blood metagenome", "bog metagenome", "book metagenome", - "bovine gut metagenome", "bovine metagenome", "brine metagenome", "canine metagenome", - "cave metagenome", "cetacean metagenome", "chemical production metagenome", - "chicken gut metagenome", "ciliate metagenome", "clay metagenome", "clinical metagenome", - "cloud metagenome", "coal metagenome", "cold seep metagenome", "cold spring metagenome", - "compost metagenome", "concrete metagenome", "coral metagenome", "coral reef metagenome", - "cow dung metagenome", "crab metagenome", "crude oil metagenome", - "Crustacea gut metagenome", "crustacean metagenome", "ctenophore metagenome", - "decomposition metagenome", 
"desalination cell metagenome", "dietary supplements metagenome", - "dinoflagellate metagenome", "drinking water metagenome", "dust metagenome", - "ear metagenome", "echinoderm metagenome", "egg metagenome", "electrolysis cell metagenome", - "endophyte metagenome", "epibiont metagenome", "estuary metagenome", "eukaryotic metagenome", - "eukaryotic plankton metagenome", "eye metagenome", "factory metagenome", "feces metagenome", - "feline metagenome", "fermentation metagenome", "fertilizer metagenome", - "fish gut metagenome", "fishing equipment metagenome", "fish metagenome", - "floral nectar metagenome", "flotsam metagenome", "flower metagenome", - "food contamination metagenome", "food fermentation metagenome", "food metagenome", - "food production metagenome", "fossil metagenome", "freshwater metagenome", - "freshwater sediment metagenome", "frog metagenome", "fuel tank metagenome", - "fungus metagenome", "gas well metagenome", "gill metagenome", "glacier lake metagenome", - "glacier metagenome", "gonad metagenome", "grain metagenome", "granuloma metagenome", - "groundwater metagenome", "gut metagenome", "halite metagenome", - "herbal medicine metagenome", "honeybee metagenome", "honey metagenome", "horse metagenome", - "hospital metagenome", "hot springs metagenome", "human bile metagenome", - "human blood metagenome", "human brain metagenome", "human eye metagenome", - "human feces metagenome", "human gut metagenome", "human hair metagenome", - "human lung metagenome", "human metagenome", "human milk metagenome", - "human nasopharyngeal metagenome", "human oral metagenome", - "human reproductive system metagenome", "human saliva metagenome", - "human semen metagenome", "human skeleton metagenome", "human skin metagenome", - "human sputum metagenome", "human tracheal metagenome", "human urinary tract metagenome", - "human vaginal metagenome", "human viral metagenome", "HVAC metagenome", - "hydrocarbon metagenome", "hydrothermal vent metagenome", "hydrozoan 
metagenome", - "hypersaline lake metagenome", "hyphosphere metagenome", "hypolithon metagenome", - "ice metagenome", "indoor metagenome", "industrial waste metagenome", - "insect gut metagenome", "insect metagenome", "insect nest metagenome", - "internal organ metagenome", "interstitial water metagenome", "invertebrate gut metagenome", - "invertebrate metagenome", "jellyfish metagenome", "karst metagenome", "koala metagenome", - "lagoon metagenome", "lake water metagenome", "landfill metagenome", "leaf litter metagenome", - "leaf metagenome", "lichen crust metagenome", "lichen metagenome", "liver metagenome", - "lung metagenome", "macroalgae metagenome", "mangrove metagenome", "manure metagenome", - "marine metagenome", "marine plankton metagenome", "marine sediment metagenome", - "marsh metagenome", "marsupial metagenome", "medical device metagenome", "metagenome", - "microbial eukaryotic metagenome", "microbial fuel cell metagenome", - "microbial mat metagenome", "microeukaryotic metagenome", "milk metagenome", - "mine drainage metagenome", "mine metagenome", "mine tailings metagenome", - "mite metagenome", "mixed culture metagenome", "mollusc metagenome", "money metagenome", - "moonmilk metagenome", "mosquito metagenome", "moss metagenome", "mouse gut metagenome", - "mouse metagenome", "mouse skin metagenome", "mud metagenome", "museum specimen metagenome", - "musk metagenome", "nematode metagenome", "neuston metagenome", "nutrient bag metagenome", - "oasis metagenome", "oil field metagenome", "oil metagenome", - "oil production facility metagenome", "oil sands metagenome", "oral metagenome", - "oral-nasopharyngeal metagenome", "oral viral metagenome", "outdoor metagenome", - "ovine metagenome", "oyster metagenome", "painting metagenome", "paper pulp metagenome", - "parasite metagenome", "parchment metagenome", "peat metagenome", "periphyton metagenome", - "permafrost metagenome", "photosynthetic picoeukaryotic metagenome", "phycosphere metagenome", - 
"phyllosphere metagenome", "phytotelma metagenome", "pig gut metagenome", "pig metagenome", - "pipeline metagenome", "pitcher plant inquiline metagenome", "placenta metagenome", - "plant metagenome", "plastic metagenome", "plastisphere metagenome", "pollen metagenome", - "pond metagenome", "poultry litter metagenome", "power plant metagenome", "primate metagenome", - "probiotic metagenome", "protist metagenome", "psyllid metagenome", "rat gut metagenome", - "rat metagenome", "reproductive system metagenome", "respiratory tract metagenome", - "retting metagenome", "rhizoplane metagenome", "rhizosphere metagenome", - "rice paddy metagenome", "riverine metagenome", "rock metagenome", - "rock porewater metagenome", "rodent metagenome", "root associated fungus metagenome", - "root metagenome", "runoff metagenome", "saline spring metagenome", "saltern metagenome", - "salt lake metagenome", "salt marsh metagenome", "salt mine metagenome", - "salt pan metagenome", "sand metagenome", "scorpion gut metagenome", - "sea anemone metagenome", "seagrass metagenome", "sea squirt metagenome", - "sea urchin metagenome", "seawater metagenome", "sediment metagenome", "seed metagenome", - "semen metagenome", "shale gas metagenome", "sheep gut metagenome", "sheep metagenome", - "shoot metagenome", "shrew metagenome", "shrimp gut metagenome", "silage metagenome", - "skin metagenome", "slag metagenome", "sludge metagenome", "snake metagenome", - "snow metagenome", "soda lake metagenome", "soda lime metagenome", "soil crust metagenome", - "soil metagenome", "solid waste metagenome", "spider metagenome", "sponge metagenome", - "starfish metagenome", "steel metagenome", "stomach metagenome", "stromatolite metagenome", - "subsurface metagenome", "surface metagenome", "symbiont metagenome", "synthetic metagenome", - "tannin metagenome", "tar pit metagenome", "termitarium metagenome", - "termite fungus garden metagenome", "termite gut metagenome", "termite metagenome", - "terrestrial 
metagenome", "tick metagenome", "tidal flat metagenome", "tin mine metagenome", - "tobacco metagenome", "tomb wall metagenome", "tree metagenome", - "upper respiratory tract metagenome", "urban metagenome", "urinary tract metagenome", - "urine metagenome", "urogenital metagenome", "vaginal metagenome", "viral metagenome", - "volcano metagenome", "wallaby gut metagenome", "wasp metagenome", "wastewater metagenome", - "wetland metagenome", "whale fall metagenome", "whole organism metagenome", "wine metagenome", - "Winogradsky column metagenome", "wood decay metagenome", "zebrafish metagenome"] -geographicLocations = ["Afghanistan", "Albania", "Algeria", "American Samoa", "Andorra", - "Angola", "Anguilla", "Antarctica", "Antigua and Barbuda", "Arctic Ocean", "Argentina", - "Armenia", "Aruba", "Ashmore and Cartier Islands", "Atlantic Ocean", "Australia", "Austria", - "Azerbaijan", "Bahamas", "Bahrain", "Baker Island", "Baltic Sea", "Bangladesh", - "Barbados", "Bassas da India", "Belarus", "Belgium", "Belize", "Benin", "Bermuda", - "Bhutan", "Bolivia", "Borneo", "Bosnia and Herzegovina", "Botswana", "Bouvet Island", - "Brazil", "British Virgin Islands", "Brunei", "Bulgaria", "Burkina Faso", "Burundi", - "Cambodia", "Cameroon", "Canada", "Cape Verde", "Cayman Islands", "Central African Republic", - "Chad", "Chile", "China", "Christmas Island", "Clipperton Island", "Cocos Islands", - "Colombia", "Comoros", "Cook Islands", "Coral Sea Islands", "Costa Rica", "Cote d'Ivoire", - "Croatia", "Cuba", "Curacao", "Cyprus", "Czech Republic", "Democratic Republic of the Congo", - "Denmark", "Djibouti", "Dominica", "Dominican Republic", "East Timor", "Ecuador", "Egypt", - "El Salvador", "Equatorial Guinea", "Eritrea", "Estonia", "Ethiopia", "Europa Island", - "Falkland Islands (Islas Malvinas)", "Faroe Islands", "Fiji", "Finland", "France", - "French Guiana", "French Polynesia", "French Southern and Antarctic Lands", "Gabon", - "Gambia", "Gaza Strip", "Georgia", "Germany", "Ghana", 
"Gibraltar", "Glorioso Islands", - "Greece", "Greenland", "GrENAda", "Guadeloupe", "Guam", "Guatemala", "Guernsey", "Guinea", - "Guinea-Bissau", "Guyana", "Haiti", "Heard Island and McDonald Islands", "Honduras", - "Hong Kong", "Howland Island", "Hungary", "Iceland", "India", "Indian Ocean", "Indonesia", - "Iran", "Iraq", "Ireland", "Isle of Man", "Israel", "Italy", "Jamaica", "Jan Mayen", "Japan", - "Jarvis Island", "Jersey", "Johnston Atoll", "Jordan", "Juan de Nova Island", "Kazakhstan", - "Kenya", "Kerguelen Archipelago", "Kingman Reef", "Kiribati", "Kosovo", "Kuwait", "Kyrgyzstan", - "Laos", "Latvia", "Lebanon", "Lesotho", "Liberia", "Libya", "Liechtenstein", "Lithuania", - "Luxembourg", "Macau", "Macedonia", "Madagascar", "Malawi", "Malaysia", "Maldives", "Mali", - "Malta", "Marshall Islands", "Martinique", "Mauritania", "Mauritius", "Mayotte", - "Mediterranean Sea", "Mexico", "Micronesia", "Midway Islands", "Moldova", "Monaco", - "Mongolia", "Montenegro", "Montserrat", "Morocco", "Mozambique", "Myanmar", "Namibia", - "Nauru", "Navassa Island", "Nepal", "Netherlands", "New Caledonia", "New Zealand", - "Nicaragua", "Niger", "Nigeria", "Niue", "Norfolk Island", "Northern Mariana Islands", - "North Korea", "North Sea", "Norway", "not applicable", "not collected", "not provided", - "Oman", "Pacific Ocean", "Pakistan", "Palau", "Palmyra Atoll", "Panama", "Papua New Guinea", - "Paracel Islands", "Paraguay", "Peru", "Philippines", "Pitcairn Islands", "Poland", - "Portugal", "Puerto Rico", "Qatar", "Republic of the Congo", "restricted access", "Reunion", - "Romania", "Ross Sea", "Russia", "Rwanda", "Saint HelENA", "Saint Kitts and Nevis", - "Saint Lucia", "Saint Pierre and Miquelon", "Saint Vincent and the GrENAdines", "Samoa", - "San Marino", "Sao Tome and Principe", "Saudi Arabia", "Senegal", "Serbia", "Seychelles", - "Sierra Leone", "Singapore", "Sint Maarten", "Slovakia", "Slovenia", "Solomon Islands", - "Somalia", "South Africa", "Southern Ocean", "South Georgia 
and the South Sandwich Islands", - "South Korea", "Spain", "Spratly Islands", "Sri Lanka", "Sudan", "Suriname", "Svalbard", - "Swaziland", "Sweden", "Switzerland", "Syria", "Taiwan", "Tajikistan", "Tanzania", - "Tasman Sea", "Thailand", "Togo", "Tokelau", "Tonga", "Trinidad and Tobago", - "Tromelin Island", "Tunisia", "Turkey", "Turkmenistan", "Turks and Caicos Islands", - "Tuvalu", "Uganda", "Ukraine", "United Arab Emirates", "United Kingdom", "Uruguay", - "USA", "Uzbekistan", "Vanuatu", "Venezuela", "Viet Nam", "Virgin Islands", "Wake Island", - "Wallis and Futuna", "West Bank", "Western Sahara", "Yemen", "Zambia", "Zimbabwe"] - -RETRY_COUNT = 5 -HQ = ("Multiple fragments where gaps span repetitive regions. Presence of the " - "23S, 16S, and 5S rRNA genes and at least 18 tRNAs.") -MQ = ("Many fragments with little to no review of assembly other than reporting " - "of standard assembly statistics.") +from .ena import ENA -class NoDataException(ValueError): - pass +from .constants import METAGENOMES, GEOGRAPHIC_LOCATIONS, MQ, HQ -def parse_args(argv): - parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter, - description="Allows to create xmls and manifest files for genome upload to ENA. " + - "--xmls and --manifests are needed to determine the action the script " + - "should perform. The use of more than one option is encouraged. 
To spare time, " + - "-xmls and -manifests should be called only if respective xml or manifest files " + - "do not already exist.") - - parser.add_argument('-u', '--upload_study', type=str, help="Study accession for genomes upload") - parser.add_argument('--genome_info', type=str, required=True, help="Genomes metadata file") +logging.basicConfig(level=logging.DEBUG) - genomeType = parser.add_mutually_exclusive_group(required=True) - genomeType.add_argument('-m', '--mags', action='store_true', help="Select for MAG upload") - genomeType.add_argument('-b', '--bins', action='store_true', help="Select for bin upload") - - parser.add_argument('--out', type=str, help="Output folder. Default: working directory") - parser.add_argument('--force', action='store_true', help="Forces reset of sample xml's backups") - parser.add_argument('--live', action='store_true', help="Uploads on ENA. Omitting this " + - "option allows to validate samples beforehand") - parser.add_argument('--tpa', action='store_true', help="Select if uploading TPA-generated genomes") - - parser.add_argument('--webin', required=True, help="Webin id") - parser.add_argument('--password', required=True, help="Webin password") - parser.add_argument('--centre_name', required=True, help="Name of the centre uploading genomes") +logger = logging.getLogger(__name__) - args = parser.parse_args(argv) +ena = ENA() + +class NoDataException(ValueError): + pass - if not args.upload_study: - raise ValueError("No project selected for genome upload [-u, --upload_study].") - - if not os.path.exists(args.genome_info): - raise FileNotFoundError('Genome metadata file "{}" does not exist'.format(args.genome_info)) - return args ''' Input table: expects the following parameters: @@ -223,7 +63,7 @@ def parse_args(argv): genome_path: path to genome to upload ''' def read_and_cleanse_metadata_tsv(inputFile, genomeType, live): - print('\tRetrieving info for genomes to submit...') + logger.info('Retrieving info for genomes to 
submit...') binMandatoryFields = ["genome_name", "accessions", "assembly_software", "binning_software", @@ -292,7 +132,7 @@ def read_and_cleanse_metadata_tsv(inputFile, genomeType, live): # are provided metagenomes part of the accepted metagenome list? if False in metadata.apply(lambda row: - True if row["metagenome"] in metagenomes + True if row["metagenome"] in METAGENOMES else False, axis=1).unique(): raise ValueError("Metagenomes associated with each genome need to belong to ENA's " + "approved metagenomes list.") @@ -332,9 +172,7 @@ def round_stats(stats): return newStat def compute_MAG_quality(completeness, contamination, RNApresence): - RNApresent = False - if str(RNApresence).lower() in ["true", "yes", "y"]: - RNApresent = True + RNApresent = str(RNApresence).lower() in ["true", "yes", "y"] quality = MQ if completeness >= 90 and contamination <= 5 and RNApresent: quality = HQ @@ -371,12 +209,12 @@ def extract_tax_info(taxInfo): elif finalKingdom == "Eukaryota": scientificName = "uncultured eukaryote" elif digitAnnotation: - scientificName = query_taxid(scientificName) + scientificName = ena.query_taxid(scientificName) elif "__" in scientificName: scientificName = scientificName.split("__")[1] else: raise ValueError("Unrecognised taxonomy format: " + scientificName) - submittable, taxid, rank = query_scientific_name(scientificName, searchRank=True) + submittable, taxid, rank = ena.query_scientific_name(scientificName, searchRank=True) if not submittable: if finalKingdom == "Archaea": @@ -389,51 +227,6 @@ def extract_tax_info(taxInfo): return taxid, scientificName -def query_taxid(taxid): - url = "https://www.ebi.ac.uk/ena/taxonomy/rest/tax-id/{}".format(taxid) - response = requests.get(url) - - try: - # Will raise exception if response status code is non-200 - response.raise_for_status() - except requests.exceptions.HTTPError as e: - print("Request failed {} with error {}".format(url, e)) - return False - - res = json.loads(response.text) - - return 
res.get("scientificName", "") - -def query_scientific_name(scientificName, searchRank=False): - url = "https://www.ebi.ac.uk/ena/taxonomy/rest/scientific-name/{}".format(scientificName) - response = requests.get(url) - - try: - # Will raise exception if response status code is non-200 - response.raise_for_status() - except requests.exceptions.HTTPError as e: - if searchRank: - return False, "", "" - else: - return False, "" - - try: - res = json.loads(response.text)[0] - except IndexError: - if searchRank: - return False, "", "" - else: - return False, "" - - submittable = res.get("submittable", "").lower() == "true" - taxid = res.get("taxId", "") - rank = res.get("rank", "") - - if searchRank: - return submittable, taxid, rank - else: - return submittable, taxid - def extract_Eukaryota_info(name, rank): nonSubmittable = (False, "", 0) @@ -443,21 +236,21 @@ def extract_Eukaryota_info(name, rank): if rank == "super kingdom": name = "uncultured eukaryote" - submittable, taxid = query_scientific_name(name) + submittable, taxid = ena.query_scientific_name(name) return submittable, name, taxid else: name = name.capitalize() + " sp." 
- submittable, taxid = query_scientific_name(name) + submittable, taxid = ena.query_scientific_name(name) if submittable: return submittable, name, taxid else: name = "uncultured " + name - submittable, taxid = query_scientific_name(name) + submittable, taxid = ena.query_scientific_name(name) if submittable: return submittable, name, taxid else: name = name.replace(" sp.", '') - submittable, taxid = query_scientific_name(name) + submittable, taxid = ena.query_scientific_name(name) if submittable: return submittable, name, taxid else: @@ -473,14 +266,14 @@ def extract_Bacteria_info(name, rank): elif rank == "genus": name = "uncultured {} sp.".format(name) - submittable, taxid, rank = query_scientific_name(name, searchRank=True) + submittable, taxid, rank = ena.query_scientific_name(name, searchRank=True) if not submittable: if rank in ["species", "genus"] and name.lower().endswith("bacteria"): name = "uncultured {}".format(name.lower().replace("bacteria", "bacterium")) elif rank == "family": if name.lower() == "deltaproteobacteria": name = "uncultured delta proteobacterium" - submittable, taxid = query_scientific_name(name) + submittable, taxid = ena.query_scientific_name(name) return submittable, name, taxid @@ -501,14 +294,14 @@ def extract_Archaea_info(name, rank): elif rank == "genus": name = "uncultured {} sp.".format(name) - submittable, taxid, rank = query_scientific_name(name, searchRank=True) + submittable, taxid, rank = ena.query_scientific_name(name, searchRank=True) if not submittable: if "Candidatus" in name: if rank == "phylum": name = name.replace("Candidatus ", '') elif rank == "family": name = name.replace("uncultured ", '') - submittable, taxid = query_scientific_name(name) + submittable, taxid = ena.query_scientific_name(name) return submittable, name, taxid @@ -546,178 +339,8 @@ def extract_genomes_info(inputFile, genomeType, live): return genomeInfo -# ------------------- ENA API HANDLER ------------------- -# TODO: organise this into a class - 
-RUN_DEFAULT_FIELDS = 'study_accession,secondary_study_accession,instrument_model,' \ - 'run_accession,sample_accession' - -ASSEMBLY_DEFAULT_FIELDS = 'sample_accession' - -SAMPLE_DEFAULT_FIELDS = 'sample_accession,secondary_sample_accession,' \ - 'collection_date,country,location' - -STUDY_DEFAULT_FIELDS = 'study_accession,secondary_study_accession,description,study_title' - -def get_default_params(): - return { - 'format': 'json', - 'includeMetagenomes': True, - 'dataPortal': 'ena' - } - -def post_request(data, webin, password): - url = "https://www.ebi.ac.uk/ena/portal/api/search" - auth = (webin, password) - default_connection_headers = { - "Content-Type": "application/x-www-form-urlencoded", - "Accept": "*/*" - } - response = requests.post(url, data=data, auth=auth, headers=default_connection_headers) - - return response - -def get_run(run_accession, webin, password, attempt=0, search_params=None): - data = get_default_params() - data['result'] = 'read_run' - data['fields'] = RUN_DEFAULT_FIELDS - data['query'] = 'run_accession=\"{}\"'.format(run_accession) - - if search_params: - data.update(search_params) - - response = post_request(data, webin, password) - - if str(response.status_code)[0] != '2' and attempt > 2: - raise ValueError("Could not retrieve run with accession {}, returned " - "message: {}".format(run_accession, response.text)) - elif response.status_code == 204: - if attempt < 2: - attempt += 1 - sleep(1) - return get_run(run_accession, webin, password, attempt) - else: - raise ValueError("Could not find run {} in ENA after {}" - " attempts".format(run_accession, RETRY_COUNT)) - try: - run = json.loads(response.text)[0] - except (IndexError, TypeError, ValueError): - raise ValueError("Could not find run {} in ENA.".format(run_accession)) - except: - raise Exception("Could not query ENA API: {}".format(response.text)) - - return run - -def get_run_from_assembly(assembly_name): - manifestXml = minidom.parseString(requests.get("https://www.ebi.ac.uk" 
+ - "/ena/browser/api/xml/" + assembly_name).text) - - run_ref = manifestXml.getElementsByTagName("RUN_REF") - run = run_ref[0].attributes["accession"].value - - return run - -def get_study(webin, password, primary_accession=None, secondary_accession=None): - data = get_default_params() - data['result'] = 'read_study' - data['fields'] = STUDY_DEFAULT_FIELDS - - if primary_accession and not secondary_accession: - data['query'] = 'study_accession="{}"'.format(primary_accession) - elif not primary_accession and secondary_accession: - data['query'] = 'secondary_study_accession="{}"'.format(secondary_accession) - else: - data['query'] = 'study_accession="{}" AND secondary_study_accession="{}"' \ - .format(primary_accession, secondary_accession) - - query_params = [] - for result_type in ['study', 'read_study', 'analysis_study']: - for data_portal in ['ena', 'metagenome']: - param = data.copy() - param['result'] = result_type - param['dataPortal'] = data_portal - if result_type == 'study': - if 'description' in param['fields']: - param['fields'] = param['fields'].replace('description', 'study_description') - query_params.append(param) - - for param in query_params: - try: - response = post_request(data, webin, password) - if response.status_code == 204: - raise NoDataException() - try: - study = json.loads(response.text)[0] - except (IndexError, TypeError, ValueError, KeyError) as e: - raise e - if data['result'] == 'study': - if 'study_description' in study: - study['description'] = study.pop('study_description') - return study - except NoDataException: - print("No info found to fetch study with params {}".format(param)) - pass - except (IndexError, TypeError, ValueError, KeyError): - print("Failed to fetch study with params {}, returned error: {}".format(param, response.text)) - - raise ValueError('Could not find study {} {} in ENA.'.format(primary_accession, secondary_accession)) - -def get_study_runs(study_acc, webin, password, fields=None, search_params=None): - 
data = get_default_params() - data['result'] = 'read_run' - data['fields'] = fields or RUN_DEFAULT_FIELDS - data['query'] = '(study_accession=\"{}\" OR secondary_study_accession=\"{}\")'.format(study_acc, study_acc) - - if search_params: - data.update(search_params) - - response = post_request(data, webin, password) - - if str(response.status_code)[0] != '2': - raise ValueError("Could not retrieve runs for study %s.", study_acc) - elif response.status_code == 204: - return [] - - try: - runs = json.loads(response.text) - except: - raise ValueError("Query against ENA API did not work. Returned " - "message: {}".format(response.text)) - - return runs - -def get_sample(sample_accession, webin, password, fields=None, search_params=None, attempt=0): - data = get_default_params() - data['result'] = 'sample' - data['fields'] = fields or SAMPLE_DEFAULT_FIELDS - data['query'] = ('(sample_accession=\"{acc}\" OR secondary_sample_accession' - '=\"{acc}\") ').format(acc=sample_accession) - - if search_params: - data.update(search_params) - - response = post_request(data, webin, password) - - if response.status_code == 200: - return json.loads(response.text)[0] - else: - if str(response.status_code)[0] != '2': - raise ValueError("Could not retrieve sample with accession {}. 
" - "Returned message: {}".format(sample_accession, response.text)) - elif response.status_code == 204: - if attempt < 2: - new_params = {'dataPortal': 'metagenome' if data['dataPortal'] == 'ena' else 'ena'} - attempt += 1 - return get_sample(sample_accession, webin, password, fields=fields, - search_params=new_params, attempt=attempt) - else: - raise ValueError("Could not find sample {} in ENA after " - "{} attempts.".format(sample_accession, RETRY_COUNT)) - -# ------------------------------------------------------- - def extract_ENA_info(genomeInfo, uploadDir, webin, password): - print('\tRetrieving project and run info from ENA (this might take a while)...') + logger.info('Retrieving project and run info from ENA (this might take a while)...') # retrieving metadata from runs (and runs from assembly accessions if provided) allRuns = [] @@ -725,13 +348,13 @@ def extract_ENA_info(genomeInfo, uploadDir, webin, password): if genomeInfo[g]["accessionType"] == "assembly": derivedRuns = [] for acc in genomeInfo[g]["accessions"]: - derivedRuns.append(get_run_from_assembly(acc)) + derivedRuns.append(ena.get_run_from_assembly(acc)) genomeInfo[g]["accessions"] = derivedRuns allRuns.extend(genomeInfo[g]["accessions"]) runsSet, studySet, samplesDict, tempDict = set(allRuns), set(), {}, {} for r in runsSet: - run_info = get_run(r, webin, password) + run_info = ena.get_run(r, webin, password) studySet.add(run_info["secondary_study_accession"]) samplesDict[r] = run_info["sample_accession"] @@ -747,15 +370,15 @@ def extract_ENA_info(genomeInfo, uploadDir, webin, password): try: backupDict = json.load(file) tempDict = dict(backupDict) - print("\tA backup file for ENA sample metadata has been found.") + logger.info("A backup file for ENA sample metadata has been found.") except json.decoder.JSONDecodeError: backupDict = {} for s in studySet: - studyInfo = get_study(webin, password, "", s) + studyInfo = ena.get_study(webin, password, "", s) projectDescription = 
studyInfo["description"] - ENA_info = get_study_runs(s, webin, password) + ENA_info = ena.get_study_runs(s, webin, password) if ENA_info == []: raise IOError("No runs found on ENA for project {}.".format(s)) for run, item in enumerate(ENA_info): @@ -763,7 +386,7 @@ def extract_ENA_info(genomeInfo, uploadDir, webin, password): if runAccession not in backupDict: if runAccession in runsSet: sampleAccession = ENA_info[run]["sample_accession"] - sampleInfo = get_sample(sampleAccession, webin, password) + sampleInfo = ena.get_sample(sampleAccession, webin, password) location = sampleInfo["location"] if 'N' in location: @@ -782,7 +405,7 @@ def extract_ENA_info(genomeInfo, uploadDir, webin, password): longitude = str(float(longitude.split('E')[0].strip())) country = sampleInfo["country"].split(':')[0] - if not country in geographicLocations: + if not country in GEOGRAPHIC_LOCATIONS: country = "not provided" collectionDate = sampleInfo["collection_date"] @@ -830,9 +453,10 @@ def combine_ENA_info(genomeInfo, ENADict): latitList.append(ENADict[run]["latitude"]) if multipleElementSet(studyList): - print("The co-assembly your MAG has been generated from comes from " + + logger.error("The co-assembly your MAG has been generated from comes from " + "different studies.") sys.exit(1) + genomeInfo[g]["study"] = studyList[0] genomeInfo[g]["description"] = descriptionList[0] @@ -878,52 +502,7 @@ def combine_ENA_info(genomeInfo, ENADict): genomeInfo[g]["accessions"] = ','.join(genomeInfo[g]["accessions"]) -def handle_genomes_registration(sample_xml, submission_xml, webin, password, live=False): - liveSub, mode = "", "live" - if not live: - liveSub = "dev" - mode = "test" - url = "https://www{}.ebi.ac.uk/ena/submit/drop-box/submit/".format(liveSub) - - print('\tRegistering sample xml in {} mode.'.format(mode)) - - f = { - 'SUBMISSION': open(submission_xml, 'r'), - 'SAMPLE': open(sample_xml, 'r') - } - submissionResponse = requests.post(url, files = f, auth = (webin, password)) - - if 
submissionResponse.status_code != 200: - if str(submissionResponse.status_code).startswith('5'): - raise Exception("Genomes could not be submitted to ENA as the server " + - "does not respond. Please again try later.") - else: - raise Exception("Genomes could not be submitted to ENA. HTTP response: " + - submissionResponse.reason) - - receiptXml = minidom.parseString((submissionResponse.content).decode("utf-8")) - receipt = receiptXml.getElementsByTagName("RECEIPT") - success = receipt[0].attributes["success"].value - if success == "true": - aliasDict = {} - samples = receiptXml.getElementsByTagName("SAMPLE") - for s in samples: - sraAcc = s.attributes["accession"].value - alias = s.attributes["alias"].value - aliasDict[alias] = sraAcc - elif success == "false": - errors = receiptXml.getElementsByTagName("ERROR") - finalError = "\tSome genomes could not be submitted to ENA. Please, check the errors below." - for error in errors: - finalError += "\n\t" + error.firstChild.data - finalError += "\n\tIf you wish to validate again your data and metadata, " - finalError += "please use the --force option." 
- raise Exception(finalError) - - print('\t{} genome samples successfully registered.'.format(str(len(aliasDict)))) - - return aliasDict def getAccessions(accessionsFile): accessionDict = {} @@ -974,7 +553,7 @@ def get_study_from_xml(sample): return study def recover_info_from_xml(genomeDict, sample_xml, live_mode): - print("Retrieving data for genome submission...") + logger.info("Retrieving data for genome submission...") # extract list of genomes (samples) to be registered xml_structure = minidom.parse(sample_xml) @@ -1177,7 +756,7 @@ def generate_genome_manifest(genomeInfo, study, manifestsRoot, aliasToSample, ge ('RUN_REF', genomeInfo["accessions"]), ('FASTA', os.path.abspath(genomeInfo["genome_path"])) ) - print("Writing manifest file (.manifest) for {}.".format(genomeInfo["alias"])) + logger.info("Writing manifest file (.manifest) for {}.".format(genomeInfo["alias"])) with open(manifest_path, "w") as outfile: for (k, v) in values: manifest = f'{k}\t{v}\n' @@ -1185,73 +764,67 @@ def generate_genome_manifest(genomeInfo, study, manifestsRoot, aliasToSample, ge if tpa: outfile.write("TPA\ttrue\n") -def file_generator(): +def main(): ENA_uploader = GenomeUpload() - - uploadDir = ENA_uploader.upload_dir - live = ENA_uploader.live - tpa = ENA_uploader.tpa - webinUser, webinPassword = ENA_uploader.username, ENA_uploader.password - genomeType, centre_name = ENA_uploader.genomeType, ENA_uploader.centre_name - if not live: - print("Warning: genome submission is not in live mode, " + + if not ENA_uploader.live: + logger.warn("Warning: genome submission is not in live mode, " + "files will be validated, but not uploaded.") xmlGenomeFile, xmlSubFile = "genome_samples.xml", "submission.xml" - samples_xml = os.path.join(uploadDir, xmlGenomeFile) - submissionXmlPath = os.path.join(uploadDir, xmlSubFile) + samples_xml = os.path.join(ENA_uploader.upload_dir, xmlGenomeFile) + submissionXmlPath = os.path.join(ENA_uploader.upload_dir, xmlSubFile) submission_xml = 
submissionXmlPath genomes, manifestInfo = {}, {} # submission xml existence if not os.path.exists(submissionXmlPath): - submission_xml = write_submission_xml(uploadDir, centre_name, False) + submission_xml = write_submission_xml(ENA_uploader.upload_dir, ENA_uploader.centre_name, False) # sample xml generation or recovery genomes = ENA_uploader.create_genome_dictionary(samples_xml) # manifests creation - manifestDir = os.path.join(uploadDir, "manifests") + manifestDir = os.path.join(ENA_uploader.upload_dir, "manifests") os.makedirs(manifestDir, exist_ok=True) accessionsgen = "registered_MAGs.tsv" - if genomeType == "bins": + if ENA_uploader.genomeType == "bins": accessionsgen = accessionsgen.replace("MAG", "bin") - if not live: + if not ENA_uploader.live: accessionsgen = accessionsgen.replace(".tsv", "_test.tsv") - accessionsFile = os.path.join(uploadDir, accessionsgen) + accessionsFile = os.path.join(ENA_uploader.upload_dir, accessionsgen) save = False writeMode = 'a' if os.path.exists(accessionsFile): - if not live: + if not ENA_uploader.live: save = True if ENA_uploader.force: writeMode = 'w' if not save: - print("Genome samples already registered, reading ERS accessions...") + logger.info("Genome samples already registered, reading ERS accessions...") aliasToNewSampleAccession = getAccessions(accessionsFile) else: save = True if save: - print("Registering genome samples XMLs...") - aliasToNewSampleAccession = handle_genomes_registration(samples_xml, - submission_xml, webinUser, webinPassword, live) + logger.info("Registering genome samples XMLs...") + aliasToNewSampleAccession = ena.handle_genomes_registration(samples_xml, + submission_xml, ENA_uploader.username, ENA_uploader.password, ENA_uploader.live) saveAccessions(aliasToNewSampleAccession, accessionsFile, writeMode) - print("Generating manifest files...") + logger.info("Generating manifest files...") manifestInfo = compute_manifests(genomes) for m in manifestInfo: generate_genome_manifest(manifestInfo[m], 
ENA_uploader.upStudy, - manifestDir, aliasToNewSampleAccession, genomeType, tpa) + manifestDir, aliasToNewSampleAccession, ENA_uploader.genomeType, ENA_uploader.tpa) class GenomeUpload: def __init__(self, argv=sys.argv[1:]): - self.args = parse_args(argv) + self.args = self.parse_args(argv) self.upStudy = self.args.upload_study self.genomeMetadata = self.args.genome_info self.genomeType = "bins" if self.args.bins else "MAGs" @@ -1265,6 +838,41 @@ def __init__(self, argv=sys.argv[1:]): workDir = self.args.out if self.args.out else os.getcwd() self.upload_dir = self.generate_genomes_upload_dir(workDir, self.genomeType) + def parse_args(argv): + parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter, + description="Allows to create xmls and manifest files for genome upload to ENA. " + + "--xmls and --manifests are needed to determine the action the script " + + "should perform. The use of more than one option is encouraged. To spare time, " + + "-xmls and -manifests should be called only if respective xml or manifest files " + + "do not already exist.") + + parser.add_argument('-u', '--upload_study', type=str, help="Study accession for genomes upload") + parser.add_argument('--genome_info', type=str, required=True, help="Genomes metadata file") + + genomeType = parser.add_mutually_exclusive_group(required=True) + genomeType.add_argument('-m', '--mags', action='store_true', help="Select for MAG upload") + genomeType.add_argument('-b', '--bins', action='store_true', help="Select for bin upload") + + parser.add_argument('--out', type=str, help="Output folder. Default: working directory") + parser.add_argument('--force', action='store_true', help="Forces reset of sample xml's backups") + parser.add_argument('--live', action='store_true', help="Uploads on ENA. 
Omitting this " + + "option allows to validate samples beforehand") + parser.add_argument('--tpa', action='store_true', help="Select if uploading TPA-generated genomes") + + parser.add_argument('--webin', required=True, help="Webin id") + parser.add_argument('--password', required=True, help="Webin password") + parser.add_argument('--centre_name', required=True, help="Name of the centre uploading genomes") + + args = parser.parse_args(argv) + + if not args.upload_study: + raise ValueError("No project selected for genome upload [-u, --upload_study].") + + if not os.path.exists(args.genome_info): + raise FileNotFoundError('Genome metadata file "{}" does not exist'.format(args.genome_info)) + + return args + def generate_genomes_upload_dir(self, dir, genomeType): uploadName = "MAG_upload" if genomeType == "bins": @@ -1274,20 +882,20 @@ def generate_genomes_upload_dir(self, dir, genomeType): return upload_dir def create_genome_dictionary(self, samples_xml): - print('Retrieving data for MAG submission...') + logger.info('Retrieving data for MAG submission...') genomeInfo = extract_genomes_info(self.genomeMetadata, self.genomeType, self.live) if not os.path.exists(samples_xml) or self.force: extract_ENA_info(genomeInfo, self.upload_dir, self.username, self.password) - print("\tWriting genome registration XML...") + logger.info("Writing genome registration XML...") write_genomes_xml(genomeInfo, samples_xml, self.genomeType, self.centre_name, self.tpa) - print("\tAll files have been written to " + self.upload_dir) + logger.info("All files have been written to " + self.upload_dir) else: recover_info_from_xml(genomeInfo, samples_xml, self.live) return genomeInfo if __name__ == "__main__": - file_generator() - print('Completed') + main() + logger.info('Completed') diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..28eefce --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,86 @@ +[project] +name = "genomeuploader" +readme = "README.md" +authors = [ + 
{name = "MGnify team", email = "metagenomics-help@ebi.ac.uk"}, +] +license = {text = "Apache Software License 2.0"} +keywords = ["bioinformatics", "tool", "metagenomics"] +dynamic = ["version"] +description = "Python script to upload bins and MAGs in fasta format to ENA (European Nucleotide Archive). This script generates xmls and manifests necessary for submission with webin-cli." +requires-python = ">=3.8" +classifiers = [ + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering :: Bio-Informatics", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", +] + +dependencies = [ + "requests==2.26.0", + "pandas==1.4.1" +] + +[project.optional-dependencies] +dev = [ + "pre-commit==3.3.3", + "black==23.7.0", + "ruff==v0.0.286", + "isort==5.12.0", + "bump-my-version==0.9.2", +] +test = [ + "pytest==7.1.2", + "pytest-cov==3.0.0", +] + +[build-system] +requires = ["setuptools>=61.0.0"] +build-backend = "setuptools.build_meta" + +[tool.setuptools] +packages = ["genomeuploader"] + +[tool.setuptools.dynamic] +version = {attr = "genomeuploader.__version__"} + +[project.scripts] +genome_upload = "genomeuploader.genome_upload:main" + +[tool.ruff] +ignore = [ + "RUF001", # ruff-specific rules ambiguous-unicode-character-string + "S101", # flake8-bandit assert + "S308", # flake8-bandit suspicious-mark-safe-usage + "E501", # pycodestyle line-too-long +] +line-length = 140 +src = ["fetchtool", "tests"] +target-version = "py38" + +[tool.ruff.flake8-pytest-style] +fixture-parentheses = false +mark-parentheses = false + +[tool.ruff.isort] +forced-separate = ["conftest"] +force-single-line = true + +[tool.black] +line-length = 140 +target-version = ["py38"] + +[tool.isort] +profile = "black" + +[tool.bumpversion] +current_version = "0.9.0" +commit = 
true +tag = true + +[[tool.bumpversion.files]] +filename = "genomeuploader/__init__.py" diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..0e86f06 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +python_files = tests/*.py +pythonpath = . \ No newline at end of file diff --git a/tests/test_dummy.py b/tests/test_dummy.py new file mode 100644 index 0000000..9095527 --- /dev/null +++ b/tests/test_dummy.py @@ -0,0 +1,5 @@ + +class TestDummy: + + def test_dummy(self): + assert 1 == 1 \ No newline at end of file From 56fa1beb31157cf15620f8588e0682acdff6e23d Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Thu, 25 Jan 2024 13:50:48 +0000 Subject: [PATCH 02/26] Fix arg parser and move exception to ena.py --- genomeuploader/ena.py | 3 +++ genomeuploader/genome_upload.py | 6 +----- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/genomeuploader/ena.py b/genomeuploader/ena.py index 3542351..5fd6f0f 100644 --- a/genomeuploader/ena.py +++ b/genomeuploader/ena.py @@ -28,6 +28,9 @@ logger = logging.getLogger(__name__) +class NoDataException(ValueError): + pass + RUN_DEFAULT_FIELDS = ','.join([ 'study_accession', diff --git a/genomeuploader/genome_upload.py b/genomeuploader/genome_upload.py index ee7055d..6d0627c 100755 --- a/genomeuploader/genome_upload.py +++ b/genomeuploader/genome_upload.py @@ -25,7 +25,6 @@ import xml.etree.ElementTree as ET import xml.dom.minidom as minidom -import requests from .ena import ENA @@ -37,9 +36,6 @@ ena = ENA() -class NoDataException(ValueError): - pass - ''' @@ -838,7 +834,7 @@ def __init__(self, argv=sys.argv[1:]): workDir = self.args.out if self.args.out else os.getcwd() self.upload_dir = self.generate_genomes_upload_dir(workDir, self.genomeType) - def parse_args(argv): + def parse_args(self, argv): parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter, description="Allows to create xmls and manifest files for genome upload to ENA. 
" + "--xmls and --manifests are needed to determine the action the script " + From 22b5cb59d65dc62bc7dbb4b54d78712a1f42fff7 Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Fri, 2 Feb 2024 13:30:39 +0000 Subject: [PATCH 03/26] ENV loading prototype --- .env.example | 2 ++ genomeuploader/genome_upload.py | 41 ++++++++++++++++++++++++++++----- pyproject.toml | 3 ++- 3 files changed, 39 insertions(+), 7 deletions(-) create mode 100644 .env.example diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..41fba3f --- /dev/null +++ b/.env.example @@ -0,0 +1,2 @@ +ENA_WEBIN="" +ENA_WEBIN_PASSWORD="" \ No newline at end of file diff --git a/genomeuploader/genome_upload.py b/genomeuploader/genome_upload.py index 6d0627c..36e918d 100755 --- a/genomeuploader/genome_upload.py +++ b/genomeuploader/genome_upload.py @@ -20,9 +20,12 @@ import argparse import re import json -import pandas as pd +from pathlib import Path from datetime import date, datetime as dt +import pandas as pd +from dotenv import load_dotenv + import xml.etree.ElementTree as ET import xml.dom.minidom as minidom @@ -825,8 +828,33 @@ def __init__(self, argv=sys.argv[1:]): self.genomeMetadata = self.args.genome_info self.genomeType = "bins" if self.args.bins else "MAGs" self.live = True if self.args.live else False - self.username = self.args.webin - self.password = self.args.password + + if self.args.webin and self.args.password: + self.username = self.args.webin + self.password = self.args.password + else: + # Config file # + user_config = Path.home() / ".genome_uploader.config" + if user_config.exists(): + logger.debug("Loading the env variables from {user_config}") + load_dotenv(str(user_config)) + else: + cwd_config = Path.cwd() / ".genome_uploader.config" + if not cwd_config.exists(): + logger.debug(f"Loading the variables from the current directory {Path.cwd()}.genome_uploader.config") + load_dotenv(str(cwd_config)) + else: + logger.debug("Trying to load env variables from the 
.env file") + # from a local .env file + load_dotenv() + + self.username = os.getenv("ENA_WEBIN") + self.password = os.getenv("ENA_WEBIN_PASSWORD") + + if not self.username or not self.password: + logger.error("ENA Webin username or password are empty") + sys.exit(1) + self.tpa = True if self.args.tpa else False self.centre_name = self.args.centre_name self.force = True if self.args.force else False @@ -855,9 +883,10 @@ def parse_args(self, argv): "option allows to validate samples beforehand") parser.add_argument('--tpa', action='store_true', help="Select if uploading TPA-generated genomes") - parser.add_argument('--webin', required=True, help="Webin id") - parser.add_argument('--password', required=True, help="Webin password") - parser.add_argument('--centre_name', required=True, help="Name of the centre uploading genomes") + # Users can provide their credentials and centre name manually or using a config file + parser.add_argument('--webin', required=False, help="Webin id") + parser.add_argument('--password', required=False, help="Webin password") + parser.add_argument('--centre_name', required=False, help="Name of the centre uploading genomes") args = parser.parse_args(argv) diff --git a/pyproject.toml b/pyproject.toml index 28eefce..bf5256f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,8 @@ classifiers = [ dependencies = [ "requests==2.26.0", - "pandas==1.4.1" + "pandas==1.4.1", + "python-dotenv==1.0.1" ] [project.optional-dependencies] From d298c7eddfd4731ad3d042ae38663004f3e3286c Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Fri, 2 Feb 2024 13:33:06 +0000 Subject: [PATCH 04/26] Invert confitional in env loading --- genomeuploader/genome_upload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/genomeuploader/genome_upload.py b/genomeuploader/genome_upload.py index 36e918d..90d29d3 100755 --- a/genomeuploader/genome_upload.py +++ b/genomeuploader/genome_upload.py @@ -840,7 +840,7 @@ def __init__(self, 
argv=sys.argv[1:]): load_dotenv(str(user_config)) else: cwd_config = Path.cwd() / ".genome_uploader.config" - if not cwd_config.exists(): + if cwd_config.exists(): logger.debug(f"Loading the variables from the current directory {Path.cwd()}.genome_uploader.config") load_dotenv(str(cwd_config)) else: From 4f5388b45174fb1378a39f70dbf09315bee517b9 Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Fri, 2 Feb 2024 13:33:48 +0000 Subject: [PATCH 05/26] Missing string interpolation f prefix --- genomeuploader/genome_upload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/genomeuploader/genome_upload.py b/genomeuploader/genome_upload.py index 90d29d3..2affc1a 100755 --- a/genomeuploader/genome_upload.py +++ b/genomeuploader/genome_upload.py @@ -836,7 +836,7 @@ def __init__(self, argv=sys.argv[1:]): # Config file # user_config = Path.home() / ".genome_uploader.config" if user_config.exists(): - logger.debug("Loading the env variables from {user_config}") + logger.debug(f"Loading the env variables from {user_config}") load_dotenv(str(user_config)) else: cwd_config = Path.cwd() / ".genome_uploader.config" From 39804543ee96b912528ccfb40e6ed429fb04a948 Mon Sep 17 00:00:00 2001 From: Ge94 Date: Fri, 2 Feb 2024 16:20:20 +0000 Subject: [PATCH 06/26] Add python-dotenv to requirements --- requirements.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements.yml b/requirements.yml index b3b2af6..55f38b1 100755 --- a/requirements.yml +++ b/requirements.yml @@ -1,9 +1,11 @@ name: genomeUploader channels: - bioconda + - conda-forge - defaults dependencies: - requests=2.26.0 - pandas=1.4.1 - ena-webin-cli + - python-dotenv=1.0.1 From ff723f30b4b8314d496ef84cf9f794714dea3852 Mon Sep 17 00:00:00 2001 From: Ge94 Date: Sat, 3 Feb 2024 13:51:37 +0000 Subject: [PATCH 07/26] ENA API refactor --- genomeuploader/ena.py | 75 +++++++++++++-------------------- genomeuploader/genome_upload.py | 21 +++------ 2 files changed, 36 insertions(+), 60 deletions(-) diff 
--git a/genomeuploader/ena.py b/genomeuploader/ena.py index 5fd6f0f..51ab45c 100644 --- a/genomeuploader/ena.py +++ b/genomeuploader/ena.py @@ -23,7 +23,7 @@ import xml.dom.minidom as minidom -logging.basicConfig(level=logging.DEBUG) +logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -53,7 +53,7 @@ class NoDataException(ValueError): STUDY_DEFAULT_FIELDS = ','.join([ 'study_accession', 'secondary_study_accession', - 'description', + 'study_description', 'study_title' ]) @@ -119,50 +119,32 @@ def get_run_from_assembly(self, assembly_name): return run - def get_study(self, webin, password, primary_accession=None, secondary_accession=None): + def get_study(self, webin, password, study_accession): data = self.get_default_params() - data['result'] = 'read_study' + data['result'] = "study" data['fields'] = STUDY_DEFAULT_FIELDS + data['query'] = 'study_accession="{}" OR secondary_study_accession="{}"' \ + .format(study_accession, study_accession) - if primary_accession and not secondary_accession: - data['query'] = 'study_accession="{}"'.format(primary_accession) - elif not primary_accession and secondary_accession: - data['query'] = 'secondary_study_accession="{}"'.format(secondary_accession) - else: - data['query'] = 'study_accession="{}" AND secondary_study_accession="{}"' \ - .format(primary_accession, secondary_accession) - - query_params = [] - for result_type in ['study', 'read_study', 'analysis_study']: - for data_portal in ['ena', 'metagenome']: - param = data.copy() - param['result'] = result_type - param['dataPortal'] = data_portal - if result_type == 'study': - if 'description' in param['fields']: - param['fields'] = param['fields'].replace('description', 'study_description') - query_params.append(param) - - for param in query_params: + data['dataPortal'] = "ena" + + try: + response = self.post_request(data, webin, password) + if response.status_code == 204: + raise NoDataException() try: - response = self.post_request(data, webin, 
password) - if response.status_code == 204: - raise NoDataException() - try: - study = json.loads(response.text)[0] - except (IndexError, TypeError, ValueError, KeyError) as e: - raise e - if data['result'] == 'study': - if 'study_description' in study: - study['description'] = study.pop('study_description') - return study - except NoDataException: - print("No info found to fetch study with params {}".format(param)) - pass - except (IndexError, TypeError, ValueError, KeyError): - print("Failed to fetch study with params {}, returned error: {}".format(param, response.text)) - - raise ValueError('Could not find study {} {} in ENA.'.format(primary_accession, secondary_accession)) + studyList = response.json() + assert len(studyList) == 1 + study = studyList[0] + except (IndexError, TypeError, ValueError, KeyError) as e: + raise e + return study + except NoDataException: + print("No info found to fetch study {}".format(study_accession)) + except (IndexError, TypeError, ValueError, KeyError): + print("Failed to fetch study {}, returned error: {}".format(study_accession, response.text)) + + raise ValueError('Could not find study {} in ENA.'.format(study_accession)) def get_study_runs(self, study_acc, webin, password, fields=None, search_params=None): data = self.get_default_params() @@ -182,7 +164,7 @@ def get_study_runs(self, study_acc, webin, password, fields=None, search_params= return [] try: - runs = json.loads(response.text) + runs = response.json() except: raise ValueError("Query against ENA API did not work. 
Returned " "message: {}".format(response.text)) @@ -202,7 +184,9 @@ def get_sample(self, sample_accession, webin, password, fields=None, search_para response = self.post_request(data, webin, password) if response.status_code == 200: - return response.json() + sample = response.json() + assert len(sample) == 1 + return sample[0] if response.status_code == 204: if attempt < 2: @@ -217,7 +201,6 @@ def get_sample(self, sample_accession, webin, password, fields=None, search_para raise ValueError("Could not retrieve sample with accession {}. " "Returned message: {}".format(sample_accession, response.text)) - def query_taxid(self, taxid): url = "https://www.ebi.ac.uk/ena/taxonomy/rest/tax-id/{}".format(taxid) response = requests.get(url) @@ -310,4 +293,4 @@ def handle_genomes_registration(self, sample_xml, submission_xml, webin, passwor logger.info('{} genome samples successfully registered.'.format(str(len(aliasDict)))) - return aliasDict \ No newline at end of file + return aliasDict diff --git a/genomeuploader/genome_upload.py b/genomeuploader/genome_upload.py index 2affc1a..72135b9 100755 --- a/genomeuploader/genome_upload.py +++ b/genomeuploader/genome_upload.py @@ -29,9 +29,9 @@ import xml.etree.ElementTree as ET import xml.dom.minidom as minidom -from .ena import ENA +from ena import ENA -from .constants import METAGENOMES, GEOGRAPHIC_LOCATIONS, MQ, HQ +from constants import METAGENOMES, GEOGRAPHIC_LOCATIONS, MQ, HQ logging.basicConfig(level=logging.DEBUG) @@ -39,8 +39,6 @@ ena = ENA() - - ''' Input table: expects the following parameters: genome_name: genome file name @@ -373,20 +371,20 @@ def extract_ENA_info(genomeInfo, uploadDir, webin, password): except json.decoder.JSONDecodeError: backupDict = {} for s in studySet: - studyInfo = ena.get_study(webin, password, "", s) - - projectDescription = studyInfo["description"] + studyInfo = ena.get_study(webin, password, s) + projectDescription = studyInfo["study_description"] ENA_info = ena.get_study_runs(s, webin, 
password) if ENA_info == []: raise IOError("No runs found on ENA for project {}.".format(s)) + for run, item in enumerate(ENA_info): runAccession = ENA_info[run]["run_accession"] if runAccession not in backupDict: if runAccession in runsSet: sampleAccession = ENA_info[run]["sample_accession"] sampleInfo = ena.get_sample(sampleAccession, webin, password) - + location = sampleInfo["location"] if 'N' in location: latitude = str(float(location.split('N')[0].strip())) @@ -450,11 +448,6 @@ def combine_ENA_info(genomeInfo, ENADict): samplesList.append(ENADict[run]["sampleAccession"]) longList.append(ENADict[run]["longitude"]) latitList.append(ENADict[run]["latitude"]) - - if multipleElementSet(studyList): - logger.error("The co-assembly your MAG has been generated from comes from " + - "different studies.") - sys.exit(1) genomeInfo[g]["study"] = studyList[0] genomeInfo[g]["description"] = descriptionList[0] @@ -767,7 +760,7 @@ def main(): ENA_uploader = GenomeUpload() if not ENA_uploader.live: - logger.warn("Warning: genome submission is not in live mode, " + + logger.warning("Warning: genome submission is not in live mode, " + "files will be validated, but not uploaded.") xmlGenomeFile, xmlSubFile = "genome_samples.xml", "submission.xml" From a2756d296a6ec53a6056b176706e2bdc7447e8e1 Mon Sep 17 00:00:00 2001 From: Ge94 Date: Sun, 4 Feb 2024 16:28:09 +0000 Subject: [PATCH 08/26] Minor fixes - env variables --- .gitignore | 6 ++++-- genomeuploader/genome_upload.py | 18 ++++++------------ 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/.gitignore b/.gitignore index 8fe6e30..9ad78de 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,3 @@ -cluster_uploader_wrapper.py - #IntelliJ project structure files *.iml *.xml @@ -71,6 +69,7 @@ venv/ ENV/ env.bak/ venv.bak/ +.genome_uploader.config.env # Rope project settings .ropeproject @@ -80,3 +79,6 @@ venv.bak/ # Ruff .ruff_cache/ + +#personal +cluster_uploader_wrapper.py diff --git a/genomeuploader/genome_upload.py 
b/genomeuploader/genome_upload.py index 72135b9..48b3d1b 100755 --- a/genomeuploader/genome_upload.py +++ b/genomeuploader/genome_upload.py @@ -494,8 +494,6 @@ def combine_ENA_info(genomeInfo, ENADict): genomeInfo[g]["accessions"] = ','.join(genomeInfo[g]["accessions"]) - - def getAccessions(accessionsFile): accessionDict = {} with open(accessionsFile, 'r') as f: @@ -826,15 +824,15 @@ def __init__(self, argv=sys.argv[1:]): self.username = self.args.webin self.password = self.args.password else: - # Config file # - user_config = Path.home() / ".genome_uploader.config" + # Config file + user_config = Path.home() / ".genome_uploader.config.env" if user_config.exists(): - logger.debug(f"Loading the env variables from {user_config}") + logger.debug("Loading the env variables from ".format(user_config)) load_dotenv(str(user_config)) else: - cwd_config = Path.cwd() / ".genome_uploader.config" + cwd_config = Path.cwd() / ".genome_uploader.config.env" if cwd_config.exists(): - logger.debug(f"Loading the variables from the current directory {Path.cwd()}.genome_uploader.config") + logger.debug("Loading the variables from the current directory.") load_dotenv(str(cwd_config)) else: logger.debug("Trying to load env variables from the .env file") @@ -857,11 +855,7 @@ def __init__(self, argv=sys.argv[1:]): def parse_args(self, argv): parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter, - description="Allows to create xmls and manifest files for genome upload to ENA. " + - "--xmls and --manifests are needed to determine the action the script " + - "should perform. The use of more than one option is encouraged. To spare time, " + - "-xmls and -manifests should be called only if respective xml or manifest files " + - "do not already exist.") + description="Create xmls and manifest files for genome upload to ENA. 
") parser.add_argument('-u', '--upload_study', type=str, help="Study accession for genomes upload") parser.add_argument('--genome_info', type=str, required=True, help="Genomes metadata file") From 6a0c805727892165d4c0b5fc2c339a057aac0ac7 Mon Sep 17 00:00:00 2001 From: Ge94 Date: Tue, 13 Feb 2024 10:34:20 +0000 Subject: [PATCH 09/26] env name change --- README.md | 2 +- requirements.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a569130..6947509 100755 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ The script needs `python`, `pandas`, `requests`, and `ena-webin-cli` to run. We ```bash # Create environment and install requirements conda env create -f requirements.yml -conda activate genomeUploader +conda activate genome_uploader ``` You can generate pre-upload files with: diff --git a/requirements.yml b/requirements.yml index 55f38b1..0de91a8 100755 --- a/requirements.yml +++ b/requirements.yml @@ -1,4 +1,4 @@ -name: genomeUploader +name: genome_uploader channels: - bioconda - conda-forge From cbf1b6837eca744b4543104c33db2a22d5964e92 Mon Sep 17 00:00:00 2001 From: Ge94 Date: Sun, 18 Feb 2024 06:05:46 +0000 Subject: [PATCH 10/26] Solved bug in euks tax resolution --- genomeuploader/genome_upload.py | 45 ++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/genomeuploader/genome_upload.py b/genomeuploader/genome_upload.py index 48b3d1b..66a81af 100755 --- a/genomeuploader/genome_upload.py +++ b/genomeuploader/genome_upload.py @@ -180,33 +180,48 @@ def compute_MAG_quality(completeness, contamination, RNApresence): return quality, completeness, contamination def extract_tax_info(taxInfo): + # if unclassified, block the execution + lineage, position, digitAnnotation = taxInfo.split(';'), 0, False + print(lineage) + lineageFirst = lineage[0] + if "Unclassified " in lineageFirst: + if "Archaea" in lineageFirst: + scientificName = "uncultured archaeon" + elif "Bacteria" in 
lineageFirst: + scientificName = "uncultured bacterium" + elif "Eukaryota" in lineageFirst: + scientificName = "uncultured eukaryote" + submittable, taxid, rank = ena.query_scientific_name(scientificName, searchRank=True) + return taxid, scientificName + kingdoms = ["Archaea", "Bacteria", "Eukaryota"] kingdomTaxa = ["2157", "2", "2759"] - lineage, position, digitAnnotation = taxInfo.split(';'), 0, False selectedKingdom, finalKingdom = kingdoms, "" - if lineage[-1].isdigit(): + if lineage[1].isdigit(): selectedKingdom = kingdomTaxa - position = 1 + position = 2 digitAnnotation = True for index, k in enumerate(selectedKingdom): - if k in lineage[position]: - finalKingdom = kingdoms[index] - + if digitAnnotation: + if k == lineage[position]: + finalKingdom = kingdoms[index] + break + else: + if k in lineage[position]: + finalKingdom = kingdoms[index] + break + iterator = len(lineage)-1 submittable = False rank = "" while iterator != -1 and not submittable: scientificName = lineage[iterator].strip() - if "Unclassified " in scientificName: - if finalKingdom == "Archaea": - scientificName = "uncultured archaeon" - elif finalKingdom == "Bacteria": - scientificName = "uncultured bacterium" - elif finalKingdom == "Eukaryota": - scientificName = "uncultured eukaryote" - elif digitAnnotation: - scientificName = ena.query_taxid(scientificName) + if digitAnnotation: + if not '*' in scientificName: + scientificName = ena.query_taxid(scientificName) + else: + iterator -= 1 elif "__" in scientificName: scientificName = scientificName.split("__")[1] else: From 22e107f96cd18ec041470cdc281bf290bf2ba11a Mon Sep 17 00:00:00 2001 From: Ekaterina Sakharova Date: Mon, 19 Feb 2024 16:08:19 +0000 Subject: [PATCH 11/26] conflicta in reqs --- requirements.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/requirements.yml b/requirements.yml index 0de91a8..d32e70c 100755 --- a/requirements.yml +++ b/requirements.yml @@ -1,11 +1,10 @@ -name: genome_uploader channels: - 
bioconda - conda-forge - defaults dependencies: - requests=2.26.0 - - pandas=1.4.1 + - pandas=1.3.3 - ena-webin-cli - python-dotenv=1.0.1 From fb5d158ae687dae31c40e43697912c179d2ce7b0 Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Mon, 19 Feb 2024 17:34:16 +0000 Subject: [PATCH 12/26] Pin python in the requirements.yml file --- requirements.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.yml b/requirements.yml index 0de91a8..dc242f3 100755 --- a/requirements.yml +++ b/requirements.yml @@ -4,6 +4,7 @@ channels: - conda-forge - defaults dependencies: + - python=3.10 - requests=2.26.0 - pandas=1.4.1 - ena-webin-cli From e5bc45c52b4d3fd10d2e3111582093cf43b28716 Mon Sep 17 00:00:00 2001 From: Ekaterina Sakharova Date: Tue, 20 Feb 2024 11:29:03 +0000 Subject: [PATCH 13/26] fix lon and lat digits --- genomeuploader/genome_upload.py | 14 ++++++++------ requirements.yml | 5 +++-- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/genomeuploader/genome_upload.py b/genomeuploader/genome_upload.py index 66a81af..6cc525d 100755 --- a/genomeuploader/genome_upload.py +++ b/genomeuploader/genome_upload.py @@ -39,6 +39,8 @@ ena = ENA() +GEOGRAPHY_DIGIT_COORDS = 8 + ''' Input table: expects the following parameters: genome_name: genome file name @@ -402,19 +404,19 @@ def extract_ENA_info(genomeInfo, uploadDir, webin, password): location = sampleInfo["location"] if 'N' in location: - latitude = str(float(location.split('N')[0].strip())) - longitude = location.split('N')[1].strip() + latitude = str(round(float(location.split('N')[0].strip()), GEOGRAPHY_DIGIT_COORDS)) + longitude = str(round(float(location.split('N')[1].strip()), GEOGRAPHY_DIGIT_COORDS)) elif 'S' in location: - latitude = '-' + str(float(location.split('S')[0].strip())) - longitude = location.split('S')[1].strip() + latitude = '-' + str(round(float(location.split('S')[0].strip()), GEOGRAPHY_DIGIT_COORDS)) + longitude = str(round(float(location.split('S')[1].strip()), 
GEOGRAPHY_DIGIT_COORDS)) else: latitude = "not provided" longitude = "not provided" if 'W' in longitude: - longitude = '-' + str(float(longitude.split('W')[0].strip())) + longitude = '-' + str(round(float(longitude.split('W')[0].strip()), GEOGRAPHY_DIGIT_COORDS)) elif longitude.endswith('E'): - longitude = str(float(longitude.split('E')[0].strip())) + longitude = str(round(float(longitude.split('E')[0].strip()), GEOGRAPHY_DIGIT_COORDS)) country = sampleInfo["country"].split(':')[0] if not country in GEOGRAPHIC_LOCATIONS: diff --git a/requirements.yml b/requirements.yml index d32e70c..73576b3 100755 --- a/requirements.yml +++ b/requirements.yml @@ -1,10 +1,11 @@ +name: genome_uploader channels: - bioconda - conda-forge - defaults dependencies: + - python=3.10 - requests=2.26.0 - - pandas=1.3.3 + - pandas=1.4.1 - ena-webin-cli - python-dotenv=1.0.1 - From 1eb12b9b26d612eb4686078882064297c18c0885 Mon Sep 17 00:00:00 2001 From: Ekaterina Sakharova Date: Tue, 20 Feb 2024 11:35:34 +0000 Subject: [PATCH 14/26] rm print lineage --- genomeuploader/genome_upload.py | 39 ++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/genomeuploader/genome_upload.py b/genomeuploader/genome_upload.py index 6cc525d..a893a33 100755 --- a/genomeuploader/genome_upload.py +++ b/genomeuploader/genome_upload.py @@ -184,7 +184,6 @@ def compute_MAG_quality(completeness, contamination, RNApresence): def extract_tax_info(taxInfo): # if unclassified, block the execution lineage, position, digitAnnotation = taxInfo.split(';'), 0, False - print(lineage) lineageFirst = lineage[0] if "Unclassified " in lineageFirst: if "Archaea" in lineageFirst: @@ -384,7 +383,7 @@ def extract_ENA_info(genomeInfo, uploadDir, webin, password): try: backupDict = json.load(file) tempDict = dict(backupDict) - logger.info("A backup file for ENA sample metadata has been found.") + logger.info(f"A backup file {backupFile} for ENA sample metadata has been found.") except 
json.decoder.JSONDecodeError: backupDict = {} for s in studySet: @@ -403,21 +402,29 @@ def extract_ENA_info(genomeInfo, uploadDir, webin, password): sampleInfo = ena.get_sample(sampleAccession, webin, password) location = sampleInfo["location"] + latitude, longitude = None, None if 'N' in location: - latitude = str(round(float(location.split('N')[0].strip()), GEOGRAPHY_DIGIT_COORDS)) - longitude = str(round(float(location.split('N')[1].strip()), GEOGRAPHY_DIGIT_COORDS)) + latitude = location.split('N')[0].strip() + longitude = location.split('N')[1].strip() elif 'S' in location: - latitude = '-' + str(round(float(location.split('S')[0].strip()), GEOGRAPHY_DIGIT_COORDS)) - longitude = str(round(float(location.split('S')[1].strip()), GEOGRAPHY_DIGIT_COORDS)) + latitude = '-' + location.split('S')[0].strip() + longitude = location.split('S')[1].strip() + + if 'W' in longitude: + longitude = '-' + longitude.split('W')[0].strip() + elif longitude.endswith('E'): + longitude = longitude.split('E')[0].strip() + + if latitude: + latitude = "{:.{}f}".format(round(float(latitude), GEOGRAPHY_DIGIT_COORDS), GEOGRAPHY_DIGIT_COORDS) else: latitude = "not provided" + + if longitude: + longitude = "{:.{}f}".format(round(float(longitude), GEOGRAPHY_DIGIT_COORDS), GEOGRAPHY_DIGIT_COORDS) + else: longitude = "not provided" - if 'W' in longitude: - longitude = '-' + str(round(float(longitude.split('W')[0].strip()), GEOGRAPHY_DIGIT_COORDS)) - elif longitude.endswith('E'): - longitude = str(round(float(longitude.split('E')[0].strip()), GEOGRAPHY_DIGIT_COORDS)) - country = sampleInfo["country"].split(':')[0] if not country in GEOGRAPHIC_LOCATIONS: country = "not provided" @@ -425,7 +432,7 @@ def extract_ENA_info(genomeInfo, uploadDir, webin, password): collectionDate = sampleInfo["collection_date"] if collectionDate == "": collectionDate = "not provided" - + tempDict[runAccession] = { "instrumentModel" : ENA_info[run]["instrument_model"], "collectionDate" : collectionDate, @@ -442,7 
+449,6 @@ def extract_ENA_info(genomeInfo, uploadDir, webin, password): file.seek(0) file.write(json.dumps(tempDict)) file.truncate() - tempDict = {**tempDict, **backupDict} combine_ENA_info(genomeInfo, tempDict) @@ -487,12 +493,12 @@ def combine_ENA_info(genomeInfo, ENADict): latitude = latitList[0] if multipleElementSet(latitList): latitude = "not provided" - genomeInfo[g]["latitude"] = latitude + genomeInfo[g]["latitude"] = str(round(float(latitude), GEOGRAPHY_DIGIT_COORDS)) longitude = longList[0] if multipleElementSet(longList): longitude = "not provided" - genomeInfo[g]["longitude"] = longitude + genomeInfo[g]["longitude"] = str(round(float(longitude), GEOGRAPHY_DIGIT_COORDS)) samples = samplesList[0] if multipleElementSet(samplesList): @@ -628,6 +634,7 @@ def create_sample_attribute(sample_attributes, data_list, mag_data=None): new_sample_attr = ET.SubElement(sample_attributes, "SAMPLE_ATTRIBUTE") ET.SubElement(new_sample_attr, 'TAG').text = tag ET.SubElement(new_sample_attr, 'VALUE').text = value + if units: ET.SubElement(new_sample_attr, 'UNITS').text = units @@ -914,9 +921,11 @@ def create_genome_dictionary(self, samples_xml): logger.info('Retrieving data for MAG submission...') genomeInfo = extract_genomes_info(self.genomeMetadata, self.genomeType, self.live) + if not os.path.exists(samples_xml) or self.force: extract_ENA_info(genomeInfo, self.upload_dir, self.username, self.password) logger.info("Writing genome registration XML...") + write_genomes_xml(genomeInfo, samples_xml, self.genomeType, self.centre_name, self.tpa) logger.info("All files have been written to " + self.upload_dir) From 6e5a08a242ccbaa7b03573eebbc86109ea60be7b Mon Sep 17 00:00:00 2001 From: Ekaterina Sakharova Date: Tue, 5 Mar 2024 14:58:58 +0000 Subject: [PATCH 15/26] add check for missing collection date --- genomeuploader/genome_upload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/genomeuploader/genome_upload.py b/genomeuploader/genome_upload.py index 
a893a33..076a43f 100755 --- a/genomeuploader/genome_upload.py +++ b/genomeuploader/genome_upload.py @@ -430,7 +430,7 @@ def extract_ENA_info(genomeInfo, uploadDir, webin, password): country = "not provided" collectionDate = sampleInfo["collection_date"] - if collectionDate == "": + if collectionDate == "" or collectionDate == "missing": collectionDate = "not provided" tempDict[runAccession] = { From 47cb39b85b175f97eb58f59763d794331ecdfd2e Mon Sep 17 00:00:00 2001 From: Metagenomics Production Date: Thu, 4 Apr 2024 15:05:36 +0100 Subject: [PATCH 16/26] Fixed bugs for euks taxonomy --- genomeuploader/genome_upload.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/genomeuploader/genome_upload.py b/genomeuploader/genome_upload.py index 076a43f..346e813 100755 --- a/genomeuploader/genome_upload.py +++ b/genomeuploader/genome_upload.py @@ -206,11 +206,11 @@ def extract_tax_info(taxInfo): for index, k in enumerate(selectedKingdom): if digitAnnotation: if k == lineage[position]: - finalKingdom = kingdoms[index] + finalKingdom = selectedKingdom[index] break else: if k in lineage[position]: - finalKingdom = kingdoms[index] + finalKingdom = selectedKingdom[index] break iterator = len(lineage)-1 @@ -223,6 +223,7 @@ def extract_tax_info(taxInfo): scientificName = ena.query_taxid(scientificName) else: iterator -= 1 + continue elif "__" in scientificName: scientificName = scientificName.split("__")[1] else: @@ -230,12 +231,13 @@ def extract_tax_info(taxInfo): submittable, taxid, rank = ena.query_scientific_name(scientificName, searchRank=True) if not submittable: - if finalKingdom == "Archaea": + if finalKingdom == "Archaea" or finalKingdom == "2157": submittable, scientificName, taxid = extract_Archaea_info(scientificName, rank) - elif finalKingdom == "Bacteria": + elif finalKingdom == "Bacteria" or finalKingdom == "2": submittable, scientificName, taxid = extract_Bacteria_info(scientificName, rank) - elif finalKingdom == "Eukaryota": + elif 
finalKingdom == "Eukaryota" or finalKingdom == "2759": submittable, scientificName, taxid = extract_Eukaryota_info(scientificName, rank) + iterator -= 1 return taxid, scientificName From 91683158570575197f5095a453de3fe003d900b0 Mon Sep 17 00:00:00 2001 From: Ge94 Date: Thu, 4 Apr 2024 17:22:39 +0100 Subject: [PATCH 17/26] Minor bug fix test/live mode --- genomeuploader/genome_upload.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/genomeuploader/genome_upload.py b/genomeuploader/genome_upload.py index 346e813..5c63229 100755 --- a/genomeuploader/genome_upload.py +++ b/genomeuploader/genome_upload.py @@ -579,12 +579,13 @@ def recover_info_from_xml(genomeDict, sample_xml, live_mode): # extract alias from xml and find a match with genomes the user is uploading XMLalias = s.attributes["alias"].value - aliasSplit = XMLalias.split("_") - XMLgenomeName = '_'.join(aliasSplit[:-1]) + if not live_mode: # remove time stamp if test mode is selected + aliasSplit = XMLalias.split("_") + XMLalias = '_'.join(aliasSplit[:-1]) for gen in genomeDict: # if match is found, associate attributes listed in the xml file # with genomes to upload - if XMLgenomeName == gen: + if XMLalias == gen: if not live_mode: currentTimestamp = str(int(dt.timestamp(dt.now()))) XMLalias = gen + '_' + currentTimestamp From b9f3c71cd86ebcf9c15d3c696057d5b7624a7c3d Mon Sep 17 00:00:00 2001 From: Ge94 Date: Mon, 22 Apr 2024 17:28:52 +0100 Subject: [PATCH 18/26] Minor logic refactor --- genomeuploader/genome_upload.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/genomeuploader/genome_upload.py b/genomeuploader/genome_upload.py index 5c63229..da856ee 100755 --- a/genomeuploader/genome_upload.py +++ b/genomeuploader/genome_upload.py @@ -95,7 +95,6 @@ def read_and_cleanse_metadata_tsv(inputFile, genomeType, live): accessionComparison = pd.DataFrame(columns=["genome_name", "attemptive_accessions", "correct", "mismatching", "co-assembly"]) 
accessionComparison["genome_name"] = metadata["genome_name"] - accessionComparison["co-assembly"] = metadata["co-assembly"] accessionComparison["attemptive_accessions"] = metadata["accessions"].map( lambda a: len(a.split(','))) @@ -121,6 +120,7 @@ def read_and_cleanse_metadata_tsv(inputFile, genomeType, live): raise ValueError("Completeness, contamination or coverage values should be formatted as floats") # check whether all co-assemblies have more than one run associated and viceversa + accessionComparison["co-assembly"] = metadata["co-assembly"] coassemblyDiscrepancy = metadata[( (accessionComparison["correct"] < 2) & (accessionComparison["co-assembly"])) | ((accessionComparison["correct"] > 1) & (~accessionComparison["co-assembly"]) @@ -333,11 +333,12 @@ def extract_genomes_info(inputFile, genomeType, live): genomeInfo[gen]["isolationSource"] = genomeInfo[gen]["metagenome"] try: - quality, compl, cont = compute_MAG_quality(genomeInfo[gen]["completeness"], - genomeInfo[gen]["contamination"], genomeInfo[gen]["rRNA_presence"]) + genomeInfo[gen]["completeness"] = str(round_stats(genomeInfo[gen]["completeness"])) + genomeInfo[gen]["contamination"] = str(round_stats(genomeInfo[gen]["contamination"])) + + quality = compute_MAG_quality(genomeInfo[gen]["completeness"], + genomeInfo[gen]["contamination"], genomeInfo[gen]["rRNA_presence"]) genomeInfo[gen]["MAG_quality"] = quality - genomeInfo[gen]["completeness"] = compl - genomeInfo[gen]["contamination"] = cont except IndexError: pass From abebc695272d45aa7adbc9812d5a1fd62b047d49 Mon Sep 17 00:00:00 2001 From: Ge94 Date: Thu, 25 Apr 2024 11:00:35 +0100 Subject: [PATCH 19/26] Added e2e test support, minor code refactoring --- examples/input_example.tsv | 2 +- genomeuploader/genome_upload.py | 17 +++++++---------- pyproject.toml | 13 +++++++++---- requirements-test.yml | 1 + tests/fixtures/ERR6769700_bin.1.fa.gz | 4 ++++ tests/fixtures/bin_upload/bin_backup.json | 12 ++++++++++++ tests/fixtures/input_fixture.tsv | 2 ++ 
tests/test_checkOutcome_e2e.yaml | 7 +++++++ 8 files changed, 43 insertions(+), 15 deletions(-) create mode 100644 requirements-test.yml create mode 100644 tests/fixtures/ERR6769700_bin.1.fa.gz create mode 100644 tests/fixtures/bin_upload/bin_backup.json create mode 100755 tests/fixtures/input_fixture.tsv create mode 100644 tests/test_checkOutcome_e2e.yaml diff --git a/examples/input_example.tsv b/examples/input_example.tsv index edc7d7d..68d5cff 100755 --- a/examples/input_example.tsv +++ b/examples/input_example.tsv @@ -1,3 +1,3 @@ -genome_name genome_path accessions assembly_software binning_software binning_parameters stats_generation_software completeness contamination genome_coverage metagenome co-assembly broad_environment local_environment environmental_medium rRNA_presence taxonomy_lineage +genome_name genome_path accessions assembly_software binning_software binning_parameters stats_generation_software completeness contamination genome_coverage metagenome co-assembly broad_environment local_environment environmental_medium rRNA_presence NCBI_lineage coassembly2728_3.fa ./coassembly2728_3.fa.gz SRR1622827,SRR1622828 MetaSPAdes_v3.15.3 metaWRAP_v1.3 default EukCC_v2.1.0 98.2 0.2 26.24 skin metagenome True human skin finger False 1;131567;2759;33154;4751;451864;5204;452284;1538075;162474;742845;55193;76775 ERR4647712_megahit_bin.fa ./ERR4647712_megahit_bin.fa.gz ERR4647712 megahit_v1.2.9 MGnify-genomes-generation-pipeline_v1.0.0 default CheckM2_v1.0.1 90.81 0.59 14.2 chicken gut metagenome False chicken gut mucosa True d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__Lactobacillus;s__Lactobacillus crispatus diff --git a/genomeuploader/genome_upload.py b/genomeuploader/genome_upload.py index da856ee..70acd3a 100755 --- a/genomeuploader/genome_upload.py +++ b/genomeuploader/genome_upload.py @@ -173,11 +173,8 @@ def round_stats(stats): def compute_MAG_quality(completeness, contamination, RNApresence): RNApresent = 
str(RNApresence).lower() in ["true", "yes", "y"] quality = MQ - if completeness >= 90 and contamination <= 5 and RNApresent: + if float(completeness) >= 90 and float(contamination) <= 5 and RNApresent: quality = HQ - - completeness = str(round_stats(completeness)) - contamination = str(round_stats(contamination)) return quality, completeness, contamination @@ -333,12 +330,12 @@ def extract_genomes_info(inputFile, genomeType, live): genomeInfo[gen]["isolationSource"] = genomeInfo[gen]["metagenome"] try: - genomeInfo[gen]["completeness"] = str(round_stats(genomeInfo[gen]["completeness"])) - genomeInfo[gen]["contamination"] = str(round_stats(genomeInfo[gen]["contamination"])) - - quality = compute_MAG_quality(genomeInfo[gen]["completeness"], - genomeInfo[gen]["contamination"], genomeInfo[gen]["rRNA_presence"]) - genomeInfo[gen]["MAG_quality"] = quality + (genomeInfo[gen]["MAG_quality"], + genomeInfo[gen]["completeness"], + genomeInfo[gen]["contamination"]) = compute_MAG_quality( + str(round_stats(genomeInfo[gen]["completeness"])), + str(round_stats(genomeInfo[gen]["contamination"])), + genomeInfo[gen]["rRNA_presence"]) except IndexError: pass diff --git a/pyproject.toml b/pyproject.toml index bf5256f..546964f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,6 +26,14 @@ dependencies = [ "python-dotenv==1.0.1" ] +[tool.pytest.ini_options] +minversion = "6.0" +addopts = "-ra -q" +testpaths = [ + "tests", + "integration", +] + [project.optional-dependencies] dev = [ "pre-commit==3.3.3", @@ -59,9 +67,6 @@ ignore = [ "S308", # flake8-bandit suspicious-mark-safe-usage "E501", # pycodestyle line-too-long ] -line-length = 140 -src = ["fetchtool", "tests"] -target-version = "py38" [tool.ruff.flake8-pytest-style] fixture-parentheses = false @@ -83,5 +88,5 @@ current_version = "0.9.0" commit = true tag = true -[[tool.bumpversion.files]] +[tool.bumpversion.files] filename = "genomeuploader/__init__.py" diff --git a/requirements-test.yml b/requirements-test.yml new file mode 
100644 index 0000000..333f06c --- /dev/null +++ b/requirements-test.yml @@ -0,0 +1 @@ +-r requirements.yml \ No newline at end of file diff --git a/tests/fixtures/ERR6769700_bin.1.fa.gz b/tests/fixtures/ERR6769700_bin.1.fa.gz new file mode 100644 index 0000000..ffed01b --- /dev/null +++ b/tests/fixtures/ERR6769700_bin.1.fa.gz @@ -0,0 +1,4 @@ +>contig1 +AGAGTAGGCGAGCTAGCGAAC +>contig2 +GATGAGCTACGAGCGATTAAC \ No newline at end of file diff --git a/tests/fixtures/bin_upload/bin_backup.json b/tests/fixtures/bin_upload/bin_backup.json new file mode 100644 index 0000000..fd752a4 --- /dev/null +++ b/tests/fixtures/bin_upload/bin_backup.json @@ -0,0 +1,12 @@ +{ + "ERR6769700": { + "instrumentModel": "DNBSEQ-G400", + "collectionDate": "2019-10-15", + "country": "Spain", + "latitude": "41.170048", + "longitude": "1.168491", + "projectDescription": "Metagenomic raw reads, assemblies, and bins derived from HoloFood chicken ileum samples. Samples selected for this batch were deeply sequenced, and were randomised among trials (feed), age, and breed to overcome batch effect. 
The samples in this project contributed to the chicken ileum MAG catalogue (project: PRJEB55375 [ERP140264]).", + "study": "ERP131894", + "assembler": "not provided", + "sampleAccession": "SAMEA10130103"} +} \ No newline at end of file diff --git a/tests/fixtures/input_fixture.tsv b/tests/fixtures/input_fixture.tsv new file mode 100755 index 0000000..e331f82 --- /dev/null +++ b/tests/fixtures/input_fixture.tsv @@ -0,0 +1,2 @@ +genome_name genome_path accessions assembly_software binning_software binning_parameters stats_generation_software completeness contamination genome_coverage metagenome co-assembly broad_environment local_environment environmental_medium rRNA_presence NCBI_lineage +ERR6769700_bin.1 ./tests/fixtures/ERR6769700_bin.1.fa.gz ERR6769700 megahit_v1.2.9 MGnify-genomes-generation-pipeline_v1.0.0 default CheckM2_v1.0.1 90.81314 0.59 14.2 chicken gut metagenome False chicken gut mucosa True d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__Lactobacillus;s__Lactobacillus crispatus diff --git a/tests/test_checkOutcome_e2e.yaml b/tests/test_checkOutcome_e2e.yaml new file mode 100644 index 0000000..28fb8c5 --- /dev/null +++ b/tests/test_checkOutcome_e2e.yaml @@ -0,0 +1,7 @@ +- name: genomeuploader end to end test + command: "python genomeuploader/genome_upload.py -u ERP159782 --genome_info tests/fixtures/input_fixture.tsv --out tests/fixtures/ --bins --centre_name EMG" + files: + - path: "tests/fixtures/bin_upload/manifests/" + - path: "tests/fixtures/bin_upload/genome_samples.xml" + - path: "tests/fixtures/bin_upload/registered_bins_test.tsv" + - path: "tests/fixtures/bin_upload/submission.xml" From e3efbe3c6496e164928cab8aa3bbdad03d6b14ea Mon Sep 17 00:00:00 2001 From: Ge94 Date: Thu, 25 Apr 2024 11:09:05 +0100 Subject: [PATCH 20/26] Updated release version --- genomeuploader/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/genomeuploader/__init__.py b/genomeuploader/__init__.py index 
aced243..8c0d5d5 100644 --- a/genomeuploader/__init__.py +++ b/genomeuploader/__init__.py @@ -1 +1 @@ -__version__ = "0.0.1" # TODO: pin the correct version \ No newline at end of file +__version__ = "2.0.0" From 4e0a82157bf8b237b285952ec1899a34d8bf8b08 Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Thu, 25 Jan 2024 13:42:08 +0000 Subject: [PATCH 21/26] Project restructure WIP --- genomeuploader/genome_upload.py | 195 ++++++++++++++------------------ 1 file changed, 82 insertions(+), 113 deletions(-) diff --git a/genomeuploader/genome_upload.py b/genomeuploader/genome_upload.py index 70acd3a..d58bb32 100755 --- a/genomeuploader/genome_upload.py +++ b/genomeuploader/genome_upload.py @@ -20,18 +20,16 @@ import argparse import re import json -from pathlib import Path -from datetime import date, datetime as dt - import pandas as pd -from dotenv import load_dotenv +from datetime import date, datetime as dt import xml.etree.ElementTree as ET import xml.dom.minidom as minidom +import requests -from ena import ENA +from .ena import ENA -from constants import METAGENOMES, GEOGRAPHIC_LOCATIONS, MQ, HQ +from .constants import METAGENOMES, GEOGRAPHIC_LOCATIONS, MQ, HQ logging.basicConfig(level=logging.DEBUG) @@ -39,7 +37,10 @@ ena = ENA() -GEOGRAPHY_DIGIT_COORDS = 8 +class NoDataException(ValueError): + pass + + ''' Input table: expects the following parameters: @@ -95,6 +96,7 @@ def read_and_cleanse_metadata_tsv(inputFile, genomeType, live): accessionComparison = pd.DataFrame(columns=["genome_name", "attemptive_accessions", "correct", "mismatching", "co-assembly"]) accessionComparison["genome_name"] = metadata["genome_name"] + accessionComparison["co-assembly"] = metadata["co-assembly"] accessionComparison["attemptive_accessions"] = metadata["accessions"].map( lambda a: len(a.split(','))) @@ -120,7 +122,6 @@ def read_and_cleanse_metadata_tsv(inputFile, genomeType, live): raise ValueError("Completeness, contamination or coverage values should be formatted as floats") # 
check whether all co-assemblies have more than one run associated and viceversa - accessionComparison["co-assembly"] = metadata["co-assembly"] coassemblyDiscrepancy = metadata[( (accessionComparison["correct"] < 2) & (accessionComparison["co-assembly"])) | ((accessionComparison["correct"] > 1) & (~accessionComparison["co-assembly"]) @@ -173,54 +174,42 @@ def round_stats(stats): def compute_MAG_quality(completeness, contamination, RNApresence): RNApresent = str(RNApresence).lower() in ["true", "yes", "y"] quality = MQ - if float(completeness) >= 90 and float(contamination) <= 5 and RNApresent: + if completeness >= 90 and contamination <= 5 and RNApresent: quality = HQ + + completeness = str(round_stats(completeness)) + contamination = str(round_stats(contamination)) return quality, completeness, contamination def extract_tax_info(taxInfo): - # if unclassified, block the execution - lineage, position, digitAnnotation = taxInfo.split(';'), 0, False - lineageFirst = lineage[0] - if "Unclassified " in lineageFirst: - if "Archaea" in lineageFirst: - scientificName = "uncultured archaeon" - elif "Bacteria" in lineageFirst: - scientificName = "uncultured bacterium" - elif "Eukaryota" in lineageFirst: - scientificName = "uncultured eukaryote" - submittable, taxid, rank = ena.query_scientific_name(scientificName, searchRank=True) - return taxid, scientificName - kingdoms = ["Archaea", "Bacteria", "Eukaryota"] kingdomTaxa = ["2157", "2", "2759"] + lineage, position, digitAnnotation = taxInfo.split(';'), 0, False selectedKingdom, finalKingdom = kingdoms, "" - if lineage[1].isdigit(): + if lineage[-1].isdigit(): selectedKingdom = kingdomTaxa - position = 2 + position = 1 digitAnnotation = True for index, k in enumerate(selectedKingdom): - if digitAnnotation: - if k == lineage[position]: - finalKingdom = selectedKingdom[index] - break - else: - if k in lineage[position]: - finalKingdom = selectedKingdom[index] - break - + if k in lineage[position]: + finalKingdom = 
kingdoms[index] + iterator = len(lineage)-1 submittable = False rank = "" while iterator != -1 and not submittable: scientificName = lineage[iterator].strip() - if digitAnnotation: - if not '*' in scientificName: - scientificName = ena.query_taxid(scientificName) - else: - iterator -= 1 - continue + if "Unclassified " in scientificName: + if finalKingdom == "Archaea": + scientificName = "uncultured archaeon" + elif finalKingdom == "Bacteria": + scientificName = "uncultured bacterium" + elif finalKingdom == "Eukaryota": + scientificName = "uncultured eukaryote" + elif digitAnnotation: + scientificName = ena.query_taxid(scientificName) elif "__" in scientificName: scientificName = scientificName.split("__")[1] else: @@ -228,13 +217,12 @@ def extract_tax_info(taxInfo): submittable, taxid, rank = ena.query_scientific_name(scientificName, searchRank=True) if not submittable: - if finalKingdom == "Archaea" or finalKingdom == "2157": + if finalKingdom == "Archaea": submittable, scientificName, taxid = extract_Archaea_info(scientificName, rank) - elif finalKingdom == "Bacteria" or finalKingdom == "2": + elif finalKingdom == "Bacteria": submittable, scientificName, taxid = extract_Bacteria_info(scientificName, rank) - elif finalKingdom == "Eukaryota" or finalKingdom == "2759": + elif finalKingdom == "Eukaryota": submittable, scientificName, taxid = extract_Eukaryota_info(scientificName, rank) - iterator -= 1 return taxid, scientificName @@ -330,12 +318,11 @@ def extract_genomes_info(inputFile, genomeType, live): genomeInfo[gen]["isolationSource"] = genomeInfo[gen]["metagenome"] try: - (genomeInfo[gen]["MAG_quality"], - genomeInfo[gen]["completeness"], - genomeInfo[gen]["contamination"]) = compute_MAG_quality( - str(round_stats(genomeInfo[gen]["completeness"])), - str(round_stats(genomeInfo[gen]["contamination"])), - genomeInfo[gen]["rRNA_presence"]) + quality, compl, cont = compute_MAG_quality(genomeInfo[gen]["completeness"], + genomeInfo[gen]["contamination"], 
genomeInfo[gen]["rRNA_presence"]) + genomeInfo[gen]["MAG_quality"] = quality + genomeInfo[gen]["completeness"] = compl + genomeInfo[gen]["contamination"] = cont except IndexError: pass @@ -383,56 +370,56 @@ def extract_ENA_info(genomeInfo, uploadDir, webin, password): try: backupDict = json.load(file) tempDict = dict(backupDict) - logger.info(f"A backup file {backupFile} for ENA sample metadata has been found.") + logger.info("A backup file for ENA sample metadata has been found.") except json.decoder.JSONDecodeError: backupDict = {} for s in studySet: - studyInfo = ena.get_study(webin, password, s) - projectDescription = studyInfo["study_description"] + studyInfo = ena.get_study(webin, password, "", s) + + projectDescription = studyInfo["description"] ENA_info = ena.get_study_runs(s, webin, password) if ENA_info == []: raise IOError("No runs found on ENA for project {}.".format(s)) - for run, item in enumerate(ENA_info): runAccession = ENA_info[run]["run_accession"] if runAccession not in backupDict: if runAccession in runsSet: + provided = True sampleAccession = ENA_info[run]["sample_accession"] sampleInfo = ena.get_sample(sampleAccession, webin, password) - + location = sampleInfo["location"] - latitude, longitude = None, None if 'N' in location: - latitude = location.split('N')[0].strip() + latitude = str(float(location.split('N')[0].strip())) longitude = location.split('N')[1].strip() elif 'S' in location: - latitude = '-' + location.split('S')[0].strip() + latitude = '-' + str(float(location.split('S')[0].strip())) longitude = location.split('S')[1].strip() + else: + latitude = "not provided" + longitude = "not provided" + provided = False if 'W' in longitude: - longitude = '-' + longitude.split('W')[0].strip() + longitude = '-' + str(float(longitude.split('W')[0].strip())) elif longitude.endswith('E'): - longitude = longitude.split('E')[0].strip() + longitude = str(float(longitude.split('E')[0].strip())) - if latitude: - latitude = 
"{:.{}f}".format(round(float(latitude), GEOGRAPHY_DIGIT_COORDS), GEOGRAPHY_DIGIT_COORDS) - else: - latitude = "not provided" + if provided: + if len(latitude) > 11: + latitude = latitude[:11] + if len(longitude) > 11: + longitude = longitude[:11] - if longitude: - longitude = "{:.{}f}".format(round(float(longitude), GEOGRAPHY_DIGIT_COORDS), GEOGRAPHY_DIGIT_COORDS) - else: - longitude = "not provided" - country = sampleInfo["country"].split(':')[0] if not country in GEOGRAPHIC_LOCATIONS: country = "not provided" collectionDate = sampleInfo["collection_date"] - if collectionDate == "" or collectionDate == "missing": + if collectionDate == "": collectionDate = "not provided" - + tempDict[runAccession] = { "instrumentModel" : ENA_info[run]["instrument_model"], "collectionDate" : collectionDate, @@ -449,6 +436,7 @@ def extract_ENA_info(genomeInfo, uploadDir, webin, password): file.seek(0) file.write(json.dumps(tempDict)) file.truncate() + tempDict = {**tempDict, **backupDict} combine_ENA_info(genomeInfo, tempDict) @@ -471,6 +459,11 @@ def combine_ENA_info(genomeInfo, ENADict): samplesList.append(ENADict[run]["sampleAccession"]) longList.append(ENADict[run]["longitude"]) latitList.append(ENADict[run]["latitude"]) + + if multipleElementSet(studyList): + logger.error("The co-assembly your MAG has been generated from comes from " + + "different studies.") + sys.exit(1) genomeInfo[g]["study"] = studyList[0] genomeInfo[g]["description"] = descriptionList[0] @@ -493,12 +486,12 @@ def combine_ENA_info(genomeInfo, ENADict): latitude = latitList[0] if multipleElementSet(latitList): latitude = "not provided" - genomeInfo[g]["latitude"] = str(round(float(latitude), GEOGRAPHY_DIGIT_COORDS)) + genomeInfo[g]["latitude"] = latitude longitude = longList[0] if multipleElementSet(longList): longitude = "not provided" - genomeInfo[g]["longitude"] = str(round(float(longitude), GEOGRAPHY_DIGIT_COORDS)) + genomeInfo[g]["longitude"] = longitude samples = samplesList[0] if 
multipleElementSet(samplesList): @@ -517,6 +510,8 @@ def combine_ENA_info(genomeInfo, ENADict): genomeInfo[g]["accessions"] = ','.join(genomeInfo[g]["accessions"]) + + def getAccessions(accessionsFile): accessionDict = {} with open(accessionsFile, 'r') as f: @@ -577,13 +572,12 @@ def recover_info_from_xml(genomeDict, sample_xml, live_mode): # extract alias from xml and find a match with genomes the user is uploading XMLalias = s.attributes["alias"].value - if not live_mode: # remove time stamp if test mode is selected - aliasSplit = XMLalias.split("_") - XMLalias = '_'.join(aliasSplit[:-1]) + aliasSplit = XMLalias.split("_") + XMLgenomeName = '_'.join(aliasSplit[:-1]) for gen in genomeDict: # if match is found, associate attributes listed in the xml file # with genomes to upload - if XMLalias == gen: + if XMLgenomeName == gen: if not live_mode: currentTimestamp = str(int(dt.timestamp(dt.now()))) XMLalias = gen + '_' + currentTimestamp @@ -635,7 +629,6 @@ def create_sample_attribute(sample_attributes, data_list, mag_data=None): new_sample_attr = ET.SubElement(sample_attributes, "SAMPLE_ATTRIBUTE") ET.SubElement(new_sample_attr, 'TAG').text = tag ET.SubElement(new_sample_attr, 'VALUE').text = value - if units: ET.SubElement(new_sample_attr, 'UNITS').text = units @@ -783,7 +776,7 @@ def main(): ENA_uploader = GenomeUpload() if not ENA_uploader.live: - logger.warning("Warning: genome submission is not in live mode, " + + logger.warn("Warning: genome submission is not in live mode, " + "files will be validated, but not uploaded.") xmlGenomeFile, xmlSubFile = "genome_samples.xml", "submission.xml" @@ -844,33 +837,8 @@ def __init__(self, argv=sys.argv[1:]): self.genomeMetadata = self.args.genome_info self.genomeType = "bins" if self.args.bins else "MAGs" self.live = True if self.args.live else False - - if self.args.webin and self.args.password: - self.username = self.args.webin - self.password = self.args.password - else: - # Config file - user_config = Path.home() / 
".genome_uploader.config.env" - if user_config.exists(): - logger.debug("Loading the env variables from ".format(user_config)) - load_dotenv(str(user_config)) - else: - cwd_config = Path.cwd() / ".genome_uploader.config.env" - if cwd_config.exists(): - logger.debug("Loading the variables from the current directory.") - load_dotenv(str(cwd_config)) - else: - logger.debug("Trying to load env variables from the .env file") - # from a local .env file - load_dotenv() - - self.username = os.getenv("ENA_WEBIN") - self.password = os.getenv("ENA_WEBIN_PASSWORD") - - if not self.username or not self.password: - logger.error("ENA Webin username or password are empty") - sys.exit(1) - + self.username = self.args.webin + self.password = self.args.password self.tpa = True if self.args.tpa else False self.centre_name = self.args.centre_name self.force = True if self.args.force else False @@ -878,9 +846,13 @@ def __init__(self, argv=sys.argv[1:]): workDir = self.args.out if self.args.out else os.getcwd() self.upload_dir = self.generate_genomes_upload_dir(workDir, self.genomeType) - def parse_args(self, argv): + def parse_args(argv): parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter, - description="Create xmls and manifest files for genome upload to ENA. ") + description="Allows to create xmls and manifest files for genome upload to ENA. " + + "--xmls and --manifests are needed to determine the action the script " + + "should perform. The use of more than one option is encouraged. 
To spare time, " + + "-xmls and -manifests should be called only if respective xml or manifest files " + + "do not already exist.") parser.add_argument('-u', '--upload_study', type=str, help="Study accession for genomes upload") parser.add_argument('--genome_info', type=str, required=True, help="Genomes metadata file") @@ -895,10 +867,9 @@ def parse_args(self, argv): "option allows to validate samples beforehand") parser.add_argument('--tpa', action='store_true', help="Select if uploading TPA-generated genomes") - # Users can provide their credentials and centre name manually or using a config file - parser.add_argument('--webin', required=False, help="Webin id") - parser.add_argument('--password', required=False, help="Webin password") - parser.add_argument('--centre_name', required=False, help="Name of the centre uploading genomes") + parser.add_argument('--webin', required=True, help="Webin id") + parser.add_argument('--password', required=True, help="Webin password") + parser.add_argument('--centre_name', required=True, help="Name of the centre uploading genomes") args = parser.parse_args(argv) @@ -922,11 +893,9 @@ def create_genome_dictionary(self, samples_xml): logger.info('Retrieving data for MAG submission...') genomeInfo = extract_genomes_info(self.genomeMetadata, self.genomeType, self.live) - if not os.path.exists(samples_xml) or self.force: extract_ENA_info(genomeInfo, self.upload_dir, self.username, self.password) logger.info("Writing genome registration XML...") - write_genomes_xml(genomeInfo, samples_xml, self.genomeType, self.centre_name, self.tpa) logger.info("All files have been written to " + self.upload_dir) From 539f2554a7073502304eb7fb0b5219e71c609fad Mon Sep 17 00:00:00 2001 From: Ge94 Date: Sat, 3 Feb 2024 13:51:37 +0000 Subject: [PATCH 22/26] ENA API refactor --- genomeuploader/genome_upload.py | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/genomeuploader/genome_upload.py 
b/genomeuploader/genome_upload.py index d58bb32..1b3939e 100755 --- a/genomeuploader/genome_upload.py +++ b/genomeuploader/genome_upload.py @@ -27,9 +27,9 @@ import xml.dom.minidom as minidom import requests -from .ena import ENA +from ena import ENA -from .constants import METAGENOMES, GEOGRAPHIC_LOCATIONS, MQ, HQ +from constants import METAGENOMES, GEOGRAPHIC_LOCATIONS, MQ, HQ logging.basicConfig(level=logging.DEBUG) @@ -37,11 +37,6 @@ ena = ENA() -class NoDataException(ValueError): - pass - - - ''' Input table: expects the following parameters: genome_name: genome file name @@ -374,13 +369,13 @@ def extract_ENA_info(genomeInfo, uploadDir, webin, password): except json.decoder.JSONDecodeError: backupDict = {} for s in studySet: - studyInfo = ena.get_study(webin, password, "", s) - - projectDescription = studyInfo["description"] + studyInfo = ena.get_study(webin, password, s) + projectDescription = studyInfo["study_description"] ENA_info = ena.get_study_runs(s, webin, password) if ENA_info == []: raise IOError("No runs found on ENA for project {}.".format(s)) + for run, item in enumerate(ENA_info): runAccession = ENA_info[run]["run_accession"] if runAccession not in backupDict: @@ -388,7 +383,7 @@ def extract_ENA_info(genomeInfo, uploadDir, webin, password): provided = True sampleAccession = ENA_info[run]["sample_accession"] sampleInfo = ena.get_sample(sampleAccession, webin, password) - + location = sampleInfo["location"] if 'N' in location: latitude = str(float(location.split('N')[0].strip())) @@ -459,11 +454,6 @@ def combine_ENA_info(genomeInfo, ENADict): samplesList.append(ENADict[run]["sampleAccession"]) longList.append(ENADict[run]["longitude"]) latitList.append(ENADict[run]["latitude"]) - - if multipleElementSet(studyList): - logger.error("The co-assembly your MAG has been generated from comes from " + - "different studies.") - sys.exit(1) genomeInfo[g]["study"] = studyList[0] genomeInfo[g]["description"] = descriptionList[0] @@ -776,7 +766,7 @@ def main(): 
ENA_uploader = GenomeUpload() if not ENA_uploader.live: - logger.warn("Warning: genome submission is not in live mode, " + + logger.warning("Warning: genome submission is not in live mode, " + "files will be validated, but not uploaded.") xmlGenomeFile, xmlSubFile = "genome_samples.xml", "submission.xml" From eb744cded15b6296c7a31ce4d3ad2fae425cf31f Mon Sep 17 00:00:00 2001 From: Ekaterina Sakharova Date: Mon, 19 Feb 2024 16:08:19 +0000 Subject: [PATCH 23/26] conflicta in reqs --- requirements.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/requirements.yml b/requirements.yml index 73576b3..4bce34b 100755 --- a/requirements.yml +++ b/requirements.yml @@ -1,4 +1,3 @@ -name: genome_uploader channels: - bioconda - conda-forge @@ -6,6 +5,6 @@ channels: dependencies: - python=3.10 - requests=2.26.0 - - pandas=1.4.1 + - pandas=1.3.3 - ena-webin-cli - python-dotenv=1.0.1 From 9826118843fc01fbc7be7515ad0438630cfeb09a Mon Sep 17 00:00:00 2001 From: Ge94 Date: Thu, 25 Apr 2024 11:16:40 +0100 Subject: [PATCH 24/26] Solved conflicts --- genomeuploader/genome_upload.py | 22 +++++++++------------- requirements.yml | 3 ++- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/genomeuploader/genome_upload.py b/genomeuploader/genome_upload.py index 1b3939e..fa78f2d 100755 --- a/genomeuploader/genome_upload.py +++ b/genomeuploader/genome_upload.py @@ -37,6 +37,8 @@ ena = ENA() +GEOGRAPHY_DIGIT_COORDS = 8 + ''' Input table: expects the following parameters: genome_name: genome file name @@ -386,27 +388,21 @@ def extract_ENA_info(genomeInfo, uploadDir, webin, password): location = sampleInfo["location"] if 'N' in location: - latitude = str(float(location.split('N')[0].strip())) - longitude = location.split('N')[1].strip() + latitude = str(round(float(location.split('N')[0].strip()), GEOGRAPHY_DIGIT_COORDS)) + longitude = str(round(float(location.split('N')[1].strip()), GEOGRAPHY_DIGIT_COORDS)) elif 'S' in location: - latitude = '-' + 
str(float(location.split('S')[0].strip())) - longitude = location.split('S')[1].strip() + latitude = '-' + str(round(float(location.split('S')[0].strip()), GEOGRAPHY_DIGIT_COORDS)) + longitude = str(round(float(location.split('S')[1].strip()), GEOGRAPHY_DIGIT_COORDS)) else: latitude = "not provided" longitude = "not provided" provided = False if 'W' in longitude: - longitude = '-' + str(float(longitude.split('W')[0].strip())) + longitude = '-' + str(round(float(longitude.split('W')[0].strip()), GEOGRAPHY_DIGIT_COORDS)) elif longitude.endswith('E'): - longitude = str(float(longitude.split('E')[0].strip())) - - if provided: - if len(latitude) > 11: - latitude = latitude[:11] - if len(longitude) > 11: - longitude = longitude[:11] - + longitude = str(round(float(longitude.split('E')[0].strip()), GEOGRAPHY_DIGIT_COORDS)) + country = sampleInfo["country"].split(':')[0] if not country in GEOGRAPHIC_LOCATIONS: country = "not provided" diff --git a/requirements.yml b/requirements.yml index 4bce34b..73576b3 100755 --- a/requirements.yml +++ b/requirements.yml @@ -1,3 +1,4 @@ +name: genome_uploader channels: - bioconda - conda-forge @@ -5,6 +6,6 @@ channels: dependencies: - python=3.10 - requests=2.26.0 - - pandas=1.3.3 + - pandas=1.4.1 - ena-webin-cli - python-dotenv=1.0.1 From 5e8062b38bd4b96fe4561b9a052d2911c2692c39 Mon Sep 17 00:00:00 2001 From: Ekaterina Sakharova Date: Tue, 20 Feb 2024 11:35:34 +0000 Subject: [PATCH 25/26] Solved conflicts --- genomeuploader/genome_upload.py | 35 ++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/genomeuploader/genome_upload.py b/genomeuploader/genome_upload.py index fa78f2d..82b661b 100755 --- a/genomeuploader/genome_upload.py +++ b/genomeuploader/genome_upload.py @@ -367,7 +367,7 @@ def extract_ENA_info(genomeInfo, uploadDir, webin, password): try: backupDict = json.load(file) tempDict = dict(backupDict) - logger.info("A backup file for ENA sample metadata has been found.") + 
logger.info(f"A backup file {backupFile} for ENA sample metadata has been found.") except json.decoder.JSONDecodeError: backupDict = {} for s in studySet: @@ -382,21 +382,32 @@ def extract_ENA_info(genomeInfo, uploadDir, webin, password): runAccession = ENA_info[run]["run_accession"] if runAccession not in backupDict: if runAccession in runsSet: - provided = True sampleAccession = ENA_info[run]["sample_accession"] sampleInfo = ena.get_sample(sampleAccession, webin, password) location = sampleInfo["location"] + latitude, longitude = None, None if 'N' in location: - latitude = str(round(float(location.split('N')[0].strip()), GEOGRAPHY_DIGIT_COORDS)) - longitude = str(round(float(location.split('N')[1].strip()), GEOGRAPHY_DIGIT_COORDS)) + latitude = location.split('N')[0].strip() + longitude = location.split('N')[1].strip() elif 'S' in location: - latitude = '-' + str(round(float(location.split('S')[0].strip()), GEOGRAPHY_DIGIT_COORDS)) - longitude = str(round(float(location.split('S')[1].strip()), GEOGRAPHY_DIGIT_COORDS)) + latitude = '-' + location.split('S')[0].strip() + longitude = location.split('S')[1].strip() + + if 'W' in longitude: + longitude = '-' + longitude.split('W')[0].strip() + elif longitude.endswith('E'): + longitude = longitude.split('E')[0].strip() + + if latitude: + latitude = "{:.{}f}".format(round(float(latitude), GEOGRAPHY_DIGIT_COORDS), GEOGRAPHY_DIGIT_COORDS) else: latitude = "not provided" + + if longitude: + longitude = "{:.{}f}".format(round(float(longitude), GEOGRAPHY_DIGIT_COORDS), GEOGRAPHY_DIGIT_COORDS) + else: longitude = "not provided" - provided = False if 'W' in longitude: longitude = '-' + str(round(float(longitude.split('W')[0].strip()), GEOGRAPHY_DIGIT_COORDS)) @@ -410,7 +421,7 @@ def extract_ENA_info(genomeInfo, uploadDir, webin, password): collectionDate = sampleInfo["collection_date"] if collectionDate == "": collectionDate = "not provided" - + tempDict[runAccession] = { "instrumentModel" : ENA_info[run]["instrument_model"], 
"collectionDate" : collectionDate, @@ -427,7 +438,6 @@ def extract_ENA_info(genomeInfo, uploadDir, webin, password): file.seek(0) file.write(json.dumps(tempDict)) file.truncate() - tempDict = {**tempDict, **backupDict} combine_ENA_info(genomeInfo, tempDict) @@ -472,12 +482,12 @@ def combine_ENA_info(genomeInfo, ENADict): latitude = latitList[0] if multipleElementSet(latitList): latitude = "not provided" - genomeInfo[g]["latitude"] = latitude + genomeInfo[g]["latitude"] = str(round(float(latitude), GEOGRAPHY_DIGIT_COORDS)) longitude = longList[0] if multipleElementSet(longList): longitude = "not provided" - genomeInfo[g]["longitude"] = longitude + genomeInfo[g]["longitude"] = str(round(float(longitude), GEOGRAPHY_DIGIT_COORDS)) samples = samplesList[0] if multipleElementSet(samplesList): @@ -615,6 +625,7 @@ def create_sample_attribute(sample_attributes, data_list, mag_data=None): new_sample_attr = ET.SubElement(sample_attributes, "SAMPLE_ATTRIBUTE") ET.SubElement(new_sample_attr, 'TAG').text = tag ET.SubElement(new_sample_attr, 'VALUE').text = value + if units: ET.SubElement(new_sample_attr, 'UNITS').text = units @@ -879,9 +890,11 @@ def create_genome_dictionary(self, samples_xml): logger.info('Retrieving data for MAG submission...') genomeInfo = extract_genomes_info(self.genomeMetadata, self.genomeType, self.live) + if not os.path.exists(samples_xml) or self.force: extract_ENA_info(genomeInfo, self.upload_dir, self.username, self.password) logger.info("Writing genome registration XML...") + write_genomes_xml(genomeInfo, samples_xml, self.genomeType, self.centre_name, self.tpa) logger.info("All files have been written to " + self.upload_dir) From d777bc8b24c6fd3759b48a5b2b6668c85469cb97 Mon Sep 17 00:00:00 2001 From: Ge94 Date: Thu, 25 Apr 2024 13:05:41 +0100 Subject: [PATCH 26/26] Pre-merge fixes --- genomeuploader/genome_upload.py | 129 ++++++++++++++++++++------------ 1 file changed, 81 insertions(+), 48 deletions(-) diff --git a/genomeuploader/genome_upload.py 
b/genomeuploader/genome_upload.py index 82b661b..a70064b 100755 --- a/genomeuploader/genome_upload.py +++ b/genomeuploader/genome_upload.py @@ -22,6 +22,8 @@ import json import pandas as pd from datetime import date, datetime as dt +from dotenv import load_dotenv +from pathlib import Path import xml.etree.ElementTree as ET import xml.dom.minidom as minidom @@ -93,7 +95,6 @@ def read_and_cleanse_metadata_tsv(inputFile, genomeType, live): accessionComparison = pd.DataFrame(columns=["genome_name", "attemptive_accessions", "correct", "mismatching", "co-assembly"]) accessionComparison["genome_name"] = metadata["genome_name"] - accessionComparison["co-assembly"] = metadata["co-assembly"] accessionComparison["attemptive_accessions"] = metadata["accessions"].map( lambda a: len(a.split(','))) @@ -119,6 +120,7 @@ def read_and_cleanse_metadata_tsv(inputFile, genomeType, live): raise ValueError("Completeness, contamination or coverage values should be formatted as floats") # check whether all co-assemblies have more than one run associated and viceversa + accessionComparison["co-assembly"] = metadata["co-assembly"] coassemblyDiscrepancy = metadata[( (accessionComparison["correct"] < 2) & (accessionComparison["co-assembly"])) | ((accessionComparison["correct"] > 1) & (~accessionComparison["co-assembly"]) @@ -171,42 +173,54 @@ def round_stats(stats): def compute_MAG_quality(completeness, contamination, RNApresence): RNApresent = str(RNApresence).lower() in ["true", "yes", "y"] quality = MQ - if completeness >= 90 and contamination <= 5 and RNApresent: + if float(completeness) >= 90 and float(contamination) <= 5 and RNApresent: quality = HQ - completeness = str(round_stats(completeness)) - contamination = str(round_stats(contamination)) - return quality, completeness, contamination def extract_tax_info(taxInfo): + # if unclassified, block the execution + lineage, position, digitAnnotation = taxInfo.split(';'), 0, False + lineageFirst = lineage[0] + if "Unclassified " in 
lineageFirst: + if "Archaea" in lineageFirst: + scientificName = "uncultured archaeon" + elif "Bacteria" in lineageFirst: + scientificName = "uncultured bacterium" + elif "Eukaryota" in lineageFirst: + scientificName = "uncultured eukaryote" + submittable, taxid, rank = ena.query_scientific_name(scientificName, searchRank=True) + return taxid, scientificName + kingdoms = ["Archaea", "Bacteria", "Eukaryota"] kingdomTaxa = ["2157", "2", "2759"] - lineage, position, digitAnnotation = taxInfo.split(';'), 0, False selectedKingdom, finalKingdom = kingdoms, "" - if lineage[-1].isdigit(): + if lineage[1].isdigit(): selectedKingdom = kingdomTaxa - position = 1 + position = 2 digitAnnotation = True for index, k in enumerate(selectedKingdom): - if k in lineage[position]: - finalKingdom = kingdoms[index] + if digitAnnotation: + if k == lineage[position]: + finalKingdom = selectedKingdom[index] + break + else: + if k in lineage[position]: + finalKingdom = selectedKingdom[index] + break iterator = len(lineage)-1 submittable = False rank = "" while iterator != -1 and not submittable: scientificName = lineage[iterator].strip() - if "Unclassified " in scientificName: - if finalKingdom == "Archaea": - scientificName = "uncultured archaeon" - elif finalKingdom == "Bacteria": - scientificName = "uncultured bacterium" - elif finalKingdom == "Eukaryota": - scientificName = "uncultured eukaryote" - elif digitAnnotation: - scientificName = ena.query_taxid(scientificName) + if digitAnnotation: + if not '*' in scientificName: + scientificName = ena.query_taxid(scientificName) + else: + iterator -= 1 + continue elif "__" in scientificName: scientificName = scientificName.split("__")[1] else: @@ -214,11 +228,11 @@ def extract_tax_info(taxInfo): submittable, taxid, rank = ena.query_scientific_name(scientificName, searchRank=True) if not submittable: - if finalKingdom == "Archaea": + if finalKingdom == "Archaea" or finalKingdom == "2157": submittable, scientificName, taxid = 
extract_Archaea_info(scientificName, rank) - elif finalKingdom == "Bacteria": + elif finalKingdom == "Bacteria" or finalKingdom == "2": submittable, scientificName, taxid = extract_Bacteria_info(scientificName, rank) - elif finalKingdom == "Eukaryota": + elif finalKingdom == "Eukaryota" or finalKingdom == "2759": submittable, scientificName, taxid = extract_Eukaryota_info(scientificName, rank) iterator -= 1 @@ -315,11 +329,12 @@ def extract_genomes_info(inputFile, genomeType, live): genomeInfo[gen]["isolationSource"] = genomeInfo[gen]["metagenome"] try: - quality, compl, cont = compute_MAG_quality(genomeInfo[gen]["completeness"], - genomeInfo[gen]["contamination"], genomeInfo[gen]["rRNA_presence"]) - genomeInfo[gen]["MAG_quality"] = quality - genomeInfo[gen]["completeness"] = compl - genomeInfo[gen]["contamination"] = cont + (genomeInfo[gen]["MAG_quality"], + genomeInfo[gen]["completeness"], + genomeInfo[gen]["contamination"]) = compute_MAG_quality( + str(round_stats(genomeInfo[gen]["completeness"])), + str(round_stats(genomeInfo[gen]["contamination"])), + genomeInfo[gen]["rRNA_presence"]) except IndexError: pass @@ -408,18 +423,13 @@ def extract_ENA_info(genomeInfo, uploadDir, webin, password): longitude = "{:.{}f}".format(round(float(longitude), GEOGRAPHY_DIGIT_COORDS), GEOGRAPHY_DIGIT_COORDS) else: longitude = "not provided" - - if 'W' in longitude: - longitude = '-' + str(round(float(longitude.split('W')[0].strip()), GEOGRAPHY_DIGIT_COORDS)) - elif longitude.endswith('E'): - longitude = str(round(float(longitude.split('E')[0].strip()), GEOGRAPHY_DIGIT_COORDS)) - + country = sampleInfo["country"].split(':')[0] if not country in GEOGRAPHIC_LOCATIONS: country = "not provided" collectionDate = sampleInfo["collection_date"] - if collectionDate == "": + if collectionDate == "" or collectionDate == "missing": collectionDate = "not provided" tempDict[runAccession] = { @@ -568,12 +578,13 @@ def recover_info_from_xml(genomeDict, sample_xml, live_mode): # extract alias 
from xml and find a match with genomes the user is uploading XMLalias = s.attributes["alias"].value - aliasSplit = XMLalias.split("_") - XMLgenomeName = '_'.join(aliasSplit[:-1]) + if not live_mode: # remove time stamp if test mode is selected + aliasSplit = XMLalias.split("_") + XMLalias = '_'.join(aliasSplit[:-1]) for gen in genomeDict: # if match is found, associate attributes listed in the xml file # with genomes to upload - if XMLgenomeName == gen: + if XMLalias == gen: if not live_mode: currentTimestamp = str(int(dt.timestamp(dt.now()))) XMLalias = gen + '_' + currentTimestamp @@ -834,8 +845,33 @@ def __init__(self, argv=sys.argv[1:]): self.genomeMetadata = self.args.genome_info self.genomeType = "bins" if self.args.bins else "MAGs" self.live = True if self.args.live else False - self.username = self.args.webin - self.password = self.args.password + + if self.args.webin and self.args.password: + self.username = self.args.webin + self.password = self.args.password + else: + # Config file + user_config = Path.home() / ".genome_uploader.config.env" + if user_config.exists(): + logger.debug("Loading the env variables from ".format(user_config)) + load_dotenv(str(user_config)) + else: + cwd_config = Path.cwd() / ".genome_uploader.config.env" + if cwd_config.exists(): + logger.debug("Loading the variables from the current directory.") + load_dotenv(str(cwd_config)) + else: + logger.debug("Trying to load env variables from the .env file") + # from a local .env file + load_dotenv() + + self.username = os.getenv("ENA_WEBIN") + self.password = os.getenv("ENA_WEBIN_PASSWORD") + + if not self.username or not self.password: + logger.error("ENA Webin username or password are empty") + sys.exit(1) + self.tpa = True if self.args.tpa else False self.centre_name = self.args.centre_name self.force = True if self.args.force else False @@ -843,13 +879,9 @@ def __init__(self, argv=sys.argv[1:]): workDir = self.args.out if self.args.out else os.getcwd() self.upload_dir = 
self.generate_genomes_upload_dir(workDir, self.genomeType) - def parse_args(argv): + def parse_args(self, argv): parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter, - description="Allows to create xmls and manifest files for genome upload to ENA. " + - "--xmls and --manifests are needed to determine the action the script " + - "should perform. The use of more than one option is encouraged. To spare time, " + - "-xmls and -manifests should be called only if respective xml or manifest files " + - "do not already exist.") + description="Create xmls and manifest files for genome upload to ENA") parser.add_argument('-u', '--upload_study', type=str, help="Study accession for genomes upload") parser.add_argument('--genome_info', type=str, required=True, help="Genomes metadata file") @@ -864,9 +896,10 @@ def parse_args(argv): "option allows to validate samples beforehand") parser.add_argument('--tpa', action='store_true', help="Select if uploading TPA-generated genomes") - parser.add_argument('--webin', required=True, help="Webin id") - parser.add_argument('--password', required=True, help="Webin password") - parser.add_argument('--centre_name', required=True, help="Name of the centre uploading genomes") + # Users can provide their credentials and centre name manually or using a config file + parser.add_argument('--webin', required=False, help="Webin id") + parser.add_argument('--password', required=False, help="Webin password") + parser.add_argument('--centre_name', required=False, help="Name of the centre uploading genomes") args = parser.parse_args(argv)