diff --git a/.Dockerfile b/.Dockerfile
index bf30cd62..e523d42a 100644
--- a/.Dockerfile
+++ b/.Dockerfile
@@ -30,5 +30,5 @@ COPY parser ./parser
 COPY tests ./tests
 COPY *.py .
 COPY default.database.ini .
-COPY logging.ini .
+COPY config/logging.ini .
 COPY .kubernetes.yml .
\ No newline at end of file
diff --git a/.github/workflows/pythonpublish.yml b/.github/workflows/pythonpublish.yml
new file mode 100644
index 00000000..a7e463b3
--- /dev/null
+++ b/.github/workflows/pythonpublish.yml
@@ -0,0 +1,44 @@
+# This workflow will upload a Python Package using Twine when a release is created
+# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
+
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third-party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+
+name: Upload Python Package
+
+#on:
+#  release:
+#    types: [published]
+on:
+  push:
+    branches:
+      - pride
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+jobs:
+  deploy:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python
+      uses: actions/setup-python@v3
+      with:
+        python-version: '3.x'
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install build
+    - name: Build package
+      run: python -m build
+    - name: Publish package
+      uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
+      with:
+        user: __token__
+        password: ${{ secrets.PYPI_API_TOKEN }}
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 91dcf247..38a72bfb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,6 +14,7 @@ logs/*
 logs/xi_mzidentml_converter.log
 !logs/.gitkeep
 .DS_Store
+/dist
 
 tests/test.db
 dbs/xiSPEC\.db
@@ -31,3 +32,5 @@ test.db
 database.ini
 kubernetes.yml
 Dockerfile
+xi_mzidentml_converter.egg-info
+build
diff --git a/app/__init__.py b/__init__.py
similarity index 100%
rename from app/__init__.py
rename to __init__.py
diff --git a/app/api.py b/app/api.py
deleted file mode 100644
index a21953fc..00000000
--- a/app/api.py
+++ /dev/null
@@ -1,31 +0,0 @@
-from fastapi import FastAPI, HTTPException
-from fastapi.middleware.cors import CORSMiddleware
-
-from app.routes.pride import pride_router
-from app.routes.pdbdev import pdbdev_router
-from app.routes.xiview import xiview_data_router
-
-app = FastAPI(title="xi-mzidentml-converter WS",
-              description="This is an API to crosslinking archive",
-              version="0.0.1",
-              license_info={
-                  "name": "Apache 2.0",
-                  "url": "https://www.apache.org/licenses/LICENSE-2.0.html",
-              },
-              openapi_url="/pride/archive/xiview/ws/openapi.json",
-              docs_url="/pride/archive/xiview/ws/docs")
-
-# Set up CORS middleware
-origins = ["*"]  # Update this with your allowed origins
-
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=origins,
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-
-app.include_router(pride_router, prefix="/pride/archive/xiview/ws")
-app.include_router(pdbdev_router, prefix="/pride/archive/xiview/ws/pdbdev")
-app.include_router(xiview_data_router, prefix="/pride/archive/xiview/ws/data")
diff --git a/app/routes/pdbdev.py b/app/routes/pdbdev.py
deleted file mode 100644
index 632da24c..00000000
--- a/app/routes/pdbdev.py
+++ /dev/null
@@ -1,137 +0,0 @@
-import psycopg2
-from fastapi import APIRouter
-from psycopg2.extras import RealDictCursor
-import logging
-
-from app.routes.shared import
get_db_connection, get_most_recent_upload_ids - -pdbdev_router = APIRouter() - -app_logger = logging.getLogger(__name__) - -@pdbdev_router.get('/projects/{project_id}/sequences', tags=["PDB-Dev"]) -async def sequences(project_id): - """ - Get all sequences belonging to a project. - - :param project_id: identifier of a project, - for ProteomeXchange projects this is the PXD****** accession - :return: JSON object with all dbref id, mzIdentML file it came from and sequences - """ - - most_recent_upload_ids = await get_most_recent_upload_ids(project_id) - - conn = None - mzid_rows = [] - try: - # connect to the PostgreSQL server and create a cursor - conn = await get_db_connection() - cur = conn.cursor(cursor_factory=RealDictCursor) - - sql = """SELECT dbseq.id, u.identification_file_name, dbseq.sequence - FROM upload AS u - JOIN dbsequence AS dbseq ON u.id = dbseq.upload_id - INNER JOIN peptideevidence pe ON dbseq.id = pe.dbsequence_ref AND dbseq.upload_id = pe.upload_id - WHERE u.id = ANY (%s) - AND pe.is_decoy = false - GROUP by dbseq.id, dbseq.sequence, u.identification_file_name;""" - - print(sql) - cur.execute(sql, [most_recent_upload_ids]) - mzid_rows = cur.fetchall() - - print("finished") - # close the communication with the PostgreSQL - cur.close() - except (Exception, psycopg2.DatabaseError) as error: - print(error) - finally: - if conn is not None: - conn.close() - print('Database connection closed.') - return {"data": mzid_rows} - - -@pdbdev_router.get('/projects/{project_id}/residue-pairs/based-on-reported-psm-level/{passing_threshold}', tags=["PDB-Dev"]) -async def get_psm_level_residue_pairs(project_id, passing_threshold): - """ - Get all residue pairs (based on PSM level data) belonging to a project. - - There will be multiple entries for identifications with - positional uncertainty of peptide in protein sequences. - :param project_id: identifier of a project, - for ProteomeXchange projects this is the PXD****** accession - :param passing_threshold: valid values: passing, all - if 'passing' return residue pairs that passed the threshold - if 'all' return all residue pairs - :return: - """ - if passing_threshold not in ['passing', 'all']: - return f"Invalid value for passing_threshold: {passing_threshold}. 
" \ - f"Valid values are: passing, all", 400 - - most_recent_upload_ids = await get_most_recent_upload_ids(project_id) - - conn = None - data = {} - try: - # connect to the PostgreSQL server and create a cursor - conn = await get_db_connection() - cur = conn.cursor(cursor_factory=RealDictCursor) - - sql = """SELECT si.id, u.identification_file_name as file, si.pass_threshold as pass, -pe1.dbsequence_ref as prot1, (pe1.pep_start + mp1.link_site1 - 1) as pos1, -pe2.dbsequence_ref as prot2, (pe2.pep_start + mp2.link_site1 - 1) as pos2 -FROM spectrumidentification si INNER JOIN -modifiedpeptide mp1 ON si.pep1_id = mp1.id AND si.upload_id = mp1.upload_id INNER JOIN -peptideevidence pe1 ON mp1.id = pe1.peptide_ref AND mp1.upload_id = pe1.upload_id INNER JOIN -modifiedpeptide mp2 ON si.pep2_id = mp2.id AND si.upload_id = mp2.upload_id INNER JOIN -peptideevidence pe2 ON mp2.id = pe2.peptide_ref AND mp2.upload_id = pe2.upload_id INNER JOIN -upload u on u.id = si.upload_id -where u.id = ANY (%s) and mp1.link_site1 > 0 and mp2.link_site1 > 0 AND pe1.is_decoy = false AND pe2.is_decoy = false -""" - # problem above in u.project_id - - if passing_threshold.lower() == 'passing': - sql += " AND si.pass_threshold = true" - sql += ";" - print(sql) - cur.execute(sql, [most_recent_upload_ids]) - mzid_rows = cur.fetchall() - data["data"] = mzid_rows - - print("finished") - # close the communication with the PostgreSQL - cur.close() - except (Exception, psycopg2.DatabaseError) as error: - print(error) - finally: - if conn is not None: - conn.close() - print('Database connection closed.') - return data - -# -# @pdb_dev_router.get('/projects/{project_id}/residue-pairs/reported') -# def get_reported_residue_pairs(project_id): -# """ -# Get all residue-pairs reported for a project -# from the ProteinDetectionList element(s). -# -# :param project_id: identifier of a project, -# for ProteomeXchange projects this is the PXD****** accession -# :return: -# """ -# return "Not Implemented", 501 - - -@pdbdev_router.get('/projects/{project_id}/reported-thresholds', tags=["PDB-Dev"]) -async def get_reported_thresholds(project_id): - """ - Get all reported thresholds for a project. 
- - :param project_id: identifier of a project, - for ProteomeXchange projects this is the PXD****** accession - :return: - """ - return "Not Implemented", 501 diff --git a/app/routes/pride.py b/app/routes/pride.py deleted file mode 100644 index 5d0723b1..00000000 --- a/app/routes/pride.py +++ /dev/null @@ -1,893 +0,0 @@ -import configparser -import os -from typing import List -import sys - -import requests -from fastapi import APIRouter, Depends, status -from fastapi import HTTPException, Security -from fastapi.security import APIKeyHeader -from sqlalchemy import text -from sqlalchemy.orm import Session, joinedload -from app.models.upload import Upload -from app.models.analysiscollection import AnalysisCollection -from app.models.dbsequence import DBSequence -from app.models.enzyme import Enzyme -from app.models.modifiedpeptide import ModifiedPeptide -from app.models.peptideevidence import PeptideEvidence -from app.models.projectdetail import ProjectDetail -from app.models.projectsubdetail import ProjectSubDetail -from app.models.searchmodification import SearchModification -from app.models.spectrum import Spectrum -from app.models.spectrumidentification import SpectrumIdentification -from app.models.spectrumidentificationprotocol import SpectrumIdentificationProtocol -from index import get_session -from process_dataset import convert_pxd_accession_from_pride -from db_config_parser import security_API_key - -import logging -import logging.config - -logger = logging.getLogger(__name__) - -api_key_header = APIKeyHeader(name="X-API-Key", auto_error=False) -pride_router = APIRouter() -config = configparser.ConfigParser() - - -def get_api_key(key: str = Security(api_key_header)) -> str: - if key == security_API_key(): - return key - raise HTTPException( - status_code=status.HTTP_401_UNAUTHORIZED, - detail="Invalid or missing API Key", - ) - - -@pride_router.get("/health", tags=["Admin"]) -async def health(session: Session = Depends(get_session)): - """ - A quick simple endpoint to test the API is working - :return: Response with OK - """ - - sql_projects_count = text(""" - SELECT - COUNT(id) AS "Number of Projects" - FROM - projectdetails p; - """) - try: - count = await get_projects_count(sql_projects_count, session) - if count[0] is not None: - db_status = "OK" - else: - db_status = "Failed" - except Exception as error: - logger.error(error) - db_status = "Failed" - logger.info('Health check endpoint accessed') - logger.debug('Health check endpoint accessed - debug') - return {'status': "OK", - 'db_status': db_status} - - -@pride_router.post("/parse", tags=["Admin"]) -async def parse(px_accession: str, temp_dir: str | None = None, dont_delete: bool = False, - api_key: str = Security(get_api_key)): - """ - Parse a new project which contain MzIdentML file - :param api_key: API KEY - :param px_accession: ProteomXchange Project Accession - :param temp_dir: If data needs to be saved in a temporary directory - :param dont_delete: Boolean value to determine if the files needs to be deleted at the end - :return: None - """ - if temp_dir: - temp_dir = os.path.expanduser(temp_dir) - else: - temp_dir = os.path.expanduser('~/mzId_convertor_temp') - convert_pxd_accession_from_pride(px_accession, temp_dir, dont_delete) - - -@pride_router.post("/update-protein-metadata/{project_id}", tags=["Admin"]) -async def update_metadata_by_project(project_id: str, session: Session = Depends(get_session), - api_key: str = Security(get_api_key)): - # Get total number of identifications (not spectra) passing Threshold 
including decoy identification - sql_number_of_identifications = text(""" - SELECT count(*) - FROM spectrumidentification - WHERE upload_id IN ( - SELECT u.id - FROM upload u - WHERE u.upload_time = ( - SELECT max(upload_time) - FROM upload - WHERE project_id = u.project_id AND identification_file_name = u.identification_file_name - ) - AND u.project_id = :projectaccession - ) - AND pass_threshold = True - """) - - # Get Total number of peptides identified passing Threshold including decoy identification(including non-crosslink) - sql_number_of_peptides = text(""" - SELECT COUNT(DISTINCT pep_id) - FROM ( - SELECT pep1_id AS pep_id - FROM spectrumidentification si - WHERE si.upload_id IN ( - SELECT u.id - FROM upload u - where u.upload_time = - (select max(upload_time) from upload - where project_id = u.project_id - and identification_file_name = u.identification_file_name ) - and u.project_id = :projectaccession - ) AND si.pass_threshold = TRUE - - UNION - - SELECT pep2_id AS pep_id - FROM spectrumidentification si - WHERE si.upload_id IN ( - SELECT u.id - FROM upload u - where u.upload_time = - (select max(upload_time) from upload - where project_id = u.project_id - and identification_file_name = u.identification_file_name ) - and u.project_id = :projectaccession - ) AND si.pass_threshold = TRUE - ) AS result; - """) - - # Total number of crosslinked proteins identified passing Threshold excluding decoy identification - sql_number_of_proteins = text(""" - SELECT COUNT(*) - FROM ( - SELECT DISTINCT dbs.accession - FROM ( - SELECT pe1.dbsequence_ref AS protein_id - FROM spectrumidentification si - INNER JOIN modifiedpeptide mp1 ON si.pep1_id = mp1.id AND si.upload_id = mp1.upload_id - INNER JOIN peptideevidence pe1 ON mp1.id = pe1.peptide_ref AND mp1.upload_id = pe1.upload_id - INNER JOIN modifiedpeptide mp2 ON si.pep2_id = mp2.id AND si.upload_id = mp2.upload_id - INNER JOIN peptideevidence pe2 ON mp2.id = pe2.peptide_ref AND mp2.upload_id = pe2.upload_id - INNER JOIN upload u ON u.id = si.upload_id - WHERE u.id IN ( - SELECT u.id - FROM upload u - WHERE u.upload_time = ( - SELECT MAX(upload_time) - FROM upload - WHERE project_id = u.project_id - AND identification_file_name = u.identification_file_name - ) AND u.project_id = :projectaccession - ) AND mp1.link_site1 > 0 AND mp2.link_site1 > 0 AND pe1.is_decoy = FALSE AND pe2.is_decoy = FALSE - AND si.pass_threshold = TRUE - - UNION - - SELECT pe2.dbsequence_ref AS protein_id - FROM spectrumidentification si - INNER JOIN modifiedpeptide mp1 ON si.pep1_id = mp1.id AND si.upload_id = mp1.upload_id - INNER JOIN peptideevidence pe1 ON mp1.id = pe1.peptide_ref AND mp1.upload_id = pe1.upload_id - INNER JOIN modifiedpeptide mp2 ON si.pep2_id = mp2.id AND si.upload_id = mp2.upload_id - INNER JOIN peptideevidence pe2 ON mp2.id = pe2.peptide_ref AND mp2.upload_id = pe2.upload_id - INNER JOIN upload u ON u.id = si.upload_id - WHERE u.id IN ( - SELECT u.id - FROM upload u - WHERE u.upload_time = ( - SELECT MAX(upload_time) - FROM upload - WHERE project_id = u.project_id - AND identification_file_name = u.identification_file_name - ) AND u.project_id = :projectaccession - ) AND mp1.link_site1 > 0 AND mp2.link_site1 > 0 AND pe1.is_decoy = FALSE AND pe2.is_decoy = FALSE - AND si.pass_threshold = TRUE - ) AS protein_id - INNER JOIN dbsequence AS dbs ON protein_id = id - ) AS accessions; - - """) - - # total number of peptides(crosslink and non crosslink) per protein - sql_peptides_per_protein = text(""" - WITH result AS ( - SELECT - pe1.dbsequence_ref AS 
dbref1, - pe1.peptide_ref AS pepref1, - pe2.dbsequence_ref AS dbref2, - pe2.peptide_ref AS pepref2 - FROM - spectrumidentification si - INNER JOIN modifiedpeptide mp1 ON si.pep1_id = mp1.id AND si.upload_id = mp1.upload_id - INNER JOIN peptideevidence pe1 ON mp1.id = pe1.peptide_ref AND mp1.upload_id = pe1.upload_id - INNER JOIN modifiedpeptide mp2 ON si.pep2_id = mp2.id AND si.upload_id = mp2.upload_id - INNER JOIN peptideevidence pe2 ON mp2.id = pe2.peptide_ref AND mp2.upload_id = pe2.upload_id - INNER JOIN upload u ON u.id = si.upload_id - WHERE - u.id IN ( - SELECT u.id - FROM upload u - WHERE u.upload_time = ( - SELECT MAX(upload_time) - FROM upload - WHERE project_id = u.project_id - AND identification_file_name = u.identification_file_name - ) AND u.project_id = :projectaccession - ) - AND pe1.is_decoy = FALSE - AND pe2.is_decoy = FALSE - AND si.pass_threshold = TRUE - ) - SELECT - dbref, - COUNT(pepref) AS peptide_count - FROM ( - SELECT dbref1 AS dbref, pepref1 AS pepref FROM result - UNION - SELECT dbref2 AS dbref, pepref2 AS pepref FROM result - ) AS inner_result - GROUP BY dbref; - - """) - - # Proteins and crosslink peptide counts - sql_crosslinks_per_protein = text(""" - WITH result AS ( - SELECT - pe1.dbsequence_ref AS dbref1, - pe1.peptide_ref AS pepref1, - pe2.dbsequence_ref AS dbref2, - pe2.peptide_ref AS pepref2 - FROM - spectrumidentification si - INNER JOIN modifiedpeptide mp1 ON si.pep1_id = mp1.id AND si.upload_id = mp1.upload_id - INNER JOIN peptideevidence pe1 ON mp1.id = pe1.peptide_ref AND mp1.upload_id = pe1.upload_id - INNER JOIN modifiedpeptide mp2 ON si.pep2_id = mp2.id AND si.upload_id = mp2.upload_id - INNER JOIN peptideevidence pe2 ON mp2.id = pe2.peptide_ref AND mp2.upload_id = pe2.upload_id - INNER JOIN upload u ON u.id = si.upload_id - WHERE - u.id IN ( - SELECT u.id - FROM upload u - WHERE u.upload_time = ( - SELECT MAX(upload_time) - FROM upload - WHERE project_id = u.project_id - AND identification_file_name = u.identification_file_name - ) AND u.project_id = :projectaccession - ) - AND mp1.link_site1 > 0 - AND mp2.link_site1 > 0 - AND pe1.is_decoy = FALSE - AND pe2.is_decoy = FALSE - AND si.pass_threshold = TRUE - ) - SELECT - dbref, - COUNT(pepref) AS peptide_count - FROM ( - SELECT DISTINCT * - FROM ( - SELECT dbref1 AS dbref, pepref1 AS pepref FROM result - UNION - SELECT dbref2 AS dbref, pepref2 AS pepref FROM result - ) AS inner_inner_result - ) AS inner_result - GROUP BY dbref; - - """) - - # Protein dbref to protein accession mapping - sql_db_sequence_accession_mapping = text(""" - SELECT id, accession - FROM dbsequence - WHERE upload_id IN ( - SELECT u.id - FROM upload u - WHERE u.upload_time = ( - SELECT MAX(upload_time) - FROM upload - WHERE project_id = u.project_id - AND identification_file_name = u.identification_file_name - ) - AND u.project_id = :projectaccession - ); - """) - - project_details = ProjectDetail() - sql_values = {"projectaccession": project_id} - - # get project details from PRIDE API - logger.info("Updating project level metadata") - px_url = 'https://www.ebi.ac.uk/pride/ws/archive/v2/projects/' + project_id - logger.debug('GET request to PRIDE API: ' + px_url) - pride_response = requests.get(px_url) - r = requests.get(px_url) - if r.status_code == 200: - logger.info('PRIDE API returned status code 200') - pride_json = pride_response.json() - if pride_json is not None: - if len(pride_json['references']) > 0: - project_details.pubmed_id = pride_json['references'][0]['pubmedId'] - if len(pride_json['title']) > 0: - 
project_details.title = pride_json['title'] - if len(pride_json['projectDescription']) > 0: - project_details.description = pride_json['projectDescription'] - if len(pride_json['organisms']) > 0: - project_details.organism = pride_json['organisms'][0]['name'] - - project_details.project_id = project_id - - project_details.number_of_spectra = await get_number_of_counts(sql_number_of_identifications, sql_values, - session) - project_details.number_of_peptides = await get_number_of_counts(sql_number_of_peptides, sql_values, session) - project_details.number_of_proteins = await get_number_of_counts(sql_number_of_proteins, sql_values, session) - - peptide_counts_by_protein = await get_counts_table(sql_peptides_per_protein, sql_values, session) - peptide_crosslinks_by_protein = await get_counts_table(sql_crosslinks_per_protein, sql_values, session) - db_sequence_accession_mapping = await get_counts_table(sql_db_sequence_accession_mapping, sql_values, - session) - - list_of_project_sub_details = [] - - # fill number of peptides - for protein in peptide_counts_by_protein: - project_sub_detail = ProjectSubDetail() - project_sub_detail.project_detail = project_details - project_sub_detail.protein_db_ref = protein['key'] - project_sub_detail.number_of_peptides = protein['value'] - list_of_project_sub_details.append(project_sub_detail) - - # fill number of crosslink - for crosslinks in peptide_crosslinks_by_protein: - for sub_details in list_of_project_sub_details: - if sub_details.protein_db_ref == crosslinks['key']: - sub_details.number_of_cross_links = crosslinks['value'] - - # fill protein accessions - for sub_details in list_of_project_sub_details: - for dbseq in db_sequence_accession_mapping: - if sub_details.protein_db_ref == dbseq['key']: - sub_details.protein_accession = dbseq['value'] - - logger.info("Updating protein level metadata") - await update_protein_metadata(list_of_project_sub_details) - - logger.info("Saving medatadata...") - conditions = {'project_id': project_id} - existing_record = session.query(ProjectDetail).filter_by(**conditions).first() - - # If the record exists, update its attributes - if existing_record: - # Delete ProjectDetail and associated ProjectSubDetail records based on project_detail_id - session.query(ProjectSubDetail).filter_by(project_detail_id=existing_record.id).delete() - session.query(ProjectDetail).filter_by(**conditions).delete() - session.commit() - - # add new record - session.add(project_details) - session.commit() - logger.info("Saving medatadata COMPLETED") - return 0 - - -@pride_router.post("/update-metadata", tags=["Admin"]) -async def update_metadata(session: Session = Depends(get_session), api_key: str = Security(get_api_key)): - """ - An endpoint to update the project details including title, description, PubmedID, - Number of proteins, peptides and spectra identifications - :param api_key: API KEY - :param session: session connection to the database - :return: None - """ - - sql_project_accession_list = text(""" - SELECT DISTINCT u.project_id FROM upload u - """) - - try: - sql_values = {} - list_of_project_id = await get_accessions(sql_project_accession_list, sql_values, session) - for project_id in list_of_project_id: - await update_metadata_by_project(project_id, session, api_key) - session.close() - except Exception as error: - logger.error(error) - session.rollback() - - -@pride_router.put("/log/{level}", tags=["Admin"]) -def change_log_level(level, api_key: str = Security(get_api_key)): - level_upper = str(level).upper() - 
logging.getLogger("uvicorn.error").setLevel(level_upper) - logging.getLogger("uvicorn.access").setLevel(level_upper) - logging.getLogger("uvicorn.asgi").setLevel(level_upper) - logger.setLevel(level_upper) - - -@pride_router.delete("/delete/{project_id}", tags=["Admin"]) -async def delete_dataset(project_id: str, session: Session = Depends(get_session), - api_key: str = Security(get_api_key)): - logger.info("Deleting dataset: " + project_id) - - try: - # Define the conditions for updating - conditions = {'project_id': project_id} - - # Query for an existing record based on conditions - existing_upload_record = session.query(Upload).filter_by(**conditions).first() - - # If the record exists, update its attributes - if existing_upload_record: - # Delete ProjectDetail and associated ProjectSubDetail records based on project_detail_id - session.query(ProjectSubDetail).filter_by(project_detail_id=existing_upload_record.id).delete() - session.query(ProjectDetail).filter_by(project_id=project_id).delete() - session.query(SpectrumIdentification).filter_by(upload_id=existing_upload_record.id).delete() - session.query(SearchModification).filter_by(upload_id=existing_upload_record.id).delete() - session.query(Enzyme).filter_by(upload_id=existing_upload_record.id).delete() - session.query(SpectrumIdentificationProtocol).filter_by(upload_id=existing_upload_record.id).delete() - session.query(ModifiedPeptide).filter_by(upload_id=existing_upload_record.id).delete() - session.query(DBSequence).filter_by(upload_id=existing_upload_record.id).delete() - session.query(Spectrum).filter_by(upload_id=existing_upload_record.id).delete() - session.query(PeptideEvidence).filter_by(upload_id=existing_upload_record.id).delete() - session.query(AnalysisCollection).filter_by(upload_id=existing_upload_record.id).delete() - session.query(Upload).filter_by(**conditions).delete() - session.commit() - except Exception as error: - logger.error(str(error)) - session.rollback() - finally: - # This is the same as the `get_db` method below - session.close() - - -@pride_router.get("/projects", tags=["Projects"]) -async def list_all_projects(session: Session = Depends(get_session), page: int = 1, page_size: int = 10) -> list[ - ProjectDetail]: - """ - This gives the high-level view of list of projects - :param session: connection to database - :param page: page number - :param page_size: number of records per page - :return: List of ProjectDetails in JSON format - """ - try: - offset = (page - 1) * page_size - projects = session.query(ProjectDetail).offset(offset).limit(page_size).all() - except Exception as e: - # Handle the exception here - logging.error(f"Error occurred: {str(e)}") - return [] - if projects is None or projects == []: - raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Projects not found") - - return projects - - -@pride_router.get("/projects/{project_id}", tags=["Projects"]) -def project_detail_view(project_id: str, session: Session = Depends(get_session)) -> List[ProjectDetail]: - """ - Retrieve project detail by px_accession. 
- :param project_id: identifier of a project, for ProteomeXchange projects this is the PXD****** accession - :param session: - :return: - """ - try: - project_detail = session.query(ProjectDetail) \ - .options(joinedload(ProjectDetail.project_sub_details)) \ - .filter(ProjectDetail.project_id == project_id) \ - .all() - except Exception as e: - logging.error(f"Error occurred: {str(e)}") - project_detail = None - - if project_detail is None or project_detail == []: - raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Project detail not found") - - return project_detail - - -@pride_router.get("/projects/{project_id}/proteins", tags=["Projects"]) -def project_detail_view(project_id: str, session: Session = Depends(get_session), page: int = 1, page_size: int = 10) -> \ - list[ProjectSubDetail]: - """ - Retrieve protein detail by px_accession. - """ - try: - project_detail = session.query(ProjectDetail) \ - .options(joinedload(ProjectDetail.project_sub_details)) \ - .filter(ProjectDetail.project_id == project_id).scalar() - - offset = (page - 1) * page_size - project_sub_details = session.query(ProjectSubDetail).filter( - ProjectSubDetail.project_detail_id == project_detail.id) \ - .offset(offset).limit(page_size).all() - - except Exception as e: - logging.error(f"Error occurred: {str(e)}") - project_sub_details = None - - if project_sub_details is None or project_sub_details == []: - raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Project detail not found") - - return project_sub_details - - -@pride_router.get("/statistics-count", tags=["Statistics"]) -async def statistics_count(session: Session = Depends(get_session)): - try: - sql_statistics_count = text(""" - SELECT - COUNT(id) AS "Number of Projects", - SUM(number_of_proteins) AS "Number of proteins", - SUM(number_of_peptides) AS "Number of peptides", - SUM(number_of_spectra) AS "Number of spectra", - COUNT(DISTINCT organism) AS "Number of species" - FROM - projectdetails p; - """) - values = await get_statistics_count(sql_statistics_count, session) - except Exception as error: - logger.error(error) - return values - - -@pride_router.get("/projects-per-species", tags=["Statistics"]) -async def project_per_species(session: Session = Depends(get_session)): - """ - Number of projects per species - :param session: session connection to the database - :return: Number of projects per species as a Dictionary - """ - try: - sql_projects_per_species = text(""" - SELECT organism, COUNT(organism) AS organism_count - FROM projectdetails - GROUP BY organism - ORDER BY COUNT(organism) ASC; -""") - values = await project_per_species_counts(sql_projects_per_species, None, session) - except Exception as error: - logger.error(error) - return values - - -@pride_router.get("/peptide-per-protein", tags=["Statistics"]) -async def peptide_per_protein(session: Session = Depends(get_session)): - """ - Get the number of peptides per protein frequency - :param session: session connection to the database - :return: Number of peptides per protein frequency as a dictionary - """ - try: - sql_peptides_per_protein = text(""" - WITH frequencytable AS ( - WITH result AS ( - SELECT - pe1.dbsequence_ref AS dbref1, - pe1.peptide_ref AS pepref1, - pe2.dbsequence_ref AS dbref2, - pe2.peptide_ref AS pepref2 - FROM - spectrumidentification si - INNER JOIN modifiedpeptide mp1 ON si.pep1_id = mp1.id AND si.upload_id = mp1.upload_id - INNER JOIN peptideevidence pe1 ON mp1.id = pe1.peptide_ref AND mp1.upload_id = pe1.upload_id - INNER JOIN 
modifiedpeptide mp2 ON si.pep2_id = mp2.id AND si.upload_id = mp2.upload_id - INNER JOIN peptideevidence pe2 ON mp2.id = pe2.peptide_ref AND mp2.upload_id = pe2.upload_id - INNER JOIN upload u ON u.id = si.upload_id - WHERE - u.id IN ( - SELECT - u.id - FROM - upload u - WHERE - u.upload_time = ( - SELECT - MAX(upload_time) - FROM - upload - WHERE - project_id = u.project_id - AND identification_file_name = u.identification_file_name - ) - ) - AND pe1.is_decoy = FALSE - AND pe2.is_decoy = FALSE - AND si.pass_threshold = TRUE - ) - SELECT - dbref, - COUNT(pepref) AS peptide_count - FROM - ( - SELECT - dbref1 AS dbref, - pepref1 AS pepref - FROM - result - UNION - SELECT - dbref2 AS dbref, - pepref2 AS pepref - FROM - result - ) AS inner_result - GROUP BY - dbref -) -SELECT - frequencytable.peptide_count, - COUNT(*) -FROM - frequencytable -GROUP BY - frequencytable.peptide_count -ORDER BY - frequencytable.peptide_count; - -""") - values = await peptide_per_protein_counts(sql_peptides_per_protein, None, session) - except Exception as error: - logger.error(error) - return values - - -async def update_protein_metadata(list_of_project_sub_details): - # 1. metadata from Uniprot - logger.info("Updating protein level metadata from Uniprot API...") - uniprot_records = await find_uniprot_data(list_of_project_sub_details) - list_of_project_sub_details = await extract_uniprot_data(list_of_project_sub_details, uniprot_records) - logger.info("Updating protein level metadata from Uniprot API COMPLETED") - - # 2. metadata from PDBe - logger.info("Updating protein level metadata from PDBe API...") - base_in_URL = "https://www.ebi.ac.uk/pdbe/api/mappings/best_structures/" - list_of_project_sub_details = await find_data_availability(list_of_project_sub_details, base_in_URL, "PDBe") - logger.info("Updating protein level metadata from PDBe API COMPLETED") - - # 3. 
metadata from AlphaFold - logger.info("Updating protein level metadata from AlphaFold API...") - base_in_URL = "https://alphafold.ebi.ac.uk/api/prediction/" - list_of_project_sub_details = await find_data_availability(list_of_project_sub_details, base_in_URL, "AlphaFold") - logger.info("Updating protein level metadata from AlphaFold API COMPLETED") - logger.info("Updating protein level metadata COMPLETED 100%") - return list_of_project_sub_details - - -async def find_uniprot_data(list_of_project_sub_details): - i = 1 - batch_size = 4 - logging.info("Uniprot Batch size: " + str(batch_size)) - base_in_URL = "https://rest.uniprot.org/uniprotkb/search?query=accession:" - fields_in_URL = "&fields=protein_name,gene_primary&size=50" - seperator = "%20OR%20" - accessions = [] - uniprot_records = [] - for sub_details in list_of_project_sub_details: - accessions.append(sub_details.protein_accession) - # batch size or last one in the list - if i % batch_size == 0 or i == len(list_of_project_sub_details): - complete_URL = "" - try: - # extra check not to submit plenty of accessions - if len(accessions) <= batch_size: - accessions_in_URL = seperator.join(accessions) - complete_URL = base_in_URL + accessions_in_URL + fields_in_URL - logging.info("Calling Uniprot API: " + complete_URL) - uniprot_response = requests.get(complete_URL).json() - if uniprot_response is not None: - logging.info("Number of results found for the query: " + str(len(uniprot_response["results"]))) - for result in uniprot_response["results"]: - uniprot_records.append(result) - else: - raise Exception("Number of accessions are greater than the batch size!") - except Exception as error: - logger.error(str(error)) - logger.error(complete_URL + " failed to get data from Uniprot:" + str(error)) - finally: - accessions = [] - i += 1 - return uniprot_records - - -async def extract_uniprot_data(list_of_project_sub_details, uniprot_records): - for sub_details in list_of_project_sub_details: - for uniprot_result in uniprot_records: - try: - if sub_details.protein_accession == uniprot_result["primaryAccession"]: - if not uniprot_result["entryType"] == "Inactive": - if uniprot_result["proteinDescription"]["recommendedName"] is not None: - sub_details.protein_name = \ - uniprot_result["proteinDescription"]["recommendedName"]["fullName"]["value"] - elif uniprot_result["proteinDescription"]["submissionNames"] is not None \ - and len(uniprot_result["proteinDescription"]["submissionNames"]) > 0: - sub_details.protein_name = \ - uniprot_result["proteinDescription"]["submissionNames"][0]["fullName"]["value"] - logger.debug(uniprot_result["primaryAccession"] + " protein name: " + sub_details.protein_name) - if uniprot_result["genes"] is None or len(uniprot_result["genes"]) == 0: - logger.error("\t" + sub_details.protein_accession + " has no genes section") - elif len(uniprot_result["genes"]) > 0: - sub_details.gene_name = uniprot_result["genes"][0]["geneName"]["value"] - logger.debug(uniprot_result["primaryAccession"] + " gene name : " + sub_details.gene_name) - else: - raise Exception("Error in matching genes section of uniprot response") - else: - logger.warn(uniprot_result["primaryAccession"] + "is Inactive") - except Exception as error: - logger.error(str(error)) - logger.error( - sub_details.protein_accession + " has an error when trying to match uniprot response:" + str(error)) - return list_of_project_sub_details - - -async def find_data_availability(list_of_project_sub_details, base_in_URL, resourse): - for sub_details in 
list_of_project_sub_details: - try: - accession_in_URL = sub_details.protein_accession - complete_URL = base_in_URL + accession_in_URL - logging.debug("Calling API: " + complete_URL) - response = requests.get(complete_URL).json() - if resourse == "PDBe": - sub_details.in_pdbe_kb = response is not None and len(response) > 0 - elif resourse == "AlphaFold": - sub_details.in_alpha_fold_db = response is not None and len(response) > 0 - except Exception as error: - logger.error(str(error)) - logger.error(complete_URL + " failed to get data from Uniprot:" + str(error)) - return list_of_project_sub_details - - -async def get_number_of_counts(sql, sql_values, session): - """ - Get total number of counts (protein, peptide, spectra Identification) for a given project - :param sql: sql statements for counts - :param sql_values: Values for SQl(i.e Project accession or protein accession) - :param session: database session - :return: total number of counts - """ - number_of_counts = 0 - try: - result = session.execute(sql, sql_values) - number_of_counts = result.scalar() - except Exception as error: - logger.error(error) - finally: - session.close() - logger.debug('Database session is closed.') - return number_of_counts - - -async def get_accessions(sql, sql_values, session): - """ - Get all the Unique accessions(project, protein) in the database according to the SQL - :param sql_values: SQl Values - :param sql: SQL to get project accessions - :param session: database session - :return: List of unique project accessions - """ - try: - result = session.execute(sql, sql_values) - # Fetch the list of accessions - list_of_accessions = [row[0] for row in result] - except Exception as error: - logger.error(error) - finally: - session.close() - logger.debug('Database session is closed.') - return list_of_accessions - - -async def get_counts_table(sql, sql_values, session): - """ - Get table of data in the database according to the SQL - :param sql_values: SQl Values - :param sql: SQL to get project accessions - :param session: database session - :return: List of key value pairs - """ - try: - with session: - result = session.execute(sql, sql_values) - result_list = [ - {'key': row[0], 'value': row[1]} for row in result if len(row) >= 2 - ] - except Exception as error: - logger.error(f"Error type: {type(error)}, Error message: {str(error)}") - finally: - logger.debug('Database session is closed.') - return result_list - - -async def project_per_species_counts(sql, sql_values, session): - """ - Get table of data in the database according to the SQL - :param sql_values: SQl Values - :param sql: SQL to get project accessions - :param session: database session - :return: List of key value pairs - """ - try: - with session: - result = session.execute(sql, sql_values) - result_list = [ - {'organism': row[0], 'count': row[1]} for row in result if len(row) >= 2 - ] - except Exception as error: - logger.error(f"Error type: {type(error)}, Error message: {str(error)}") - finally: - logger.debug('Database session is closed.') - return result_list - - -async def peptide_per_protein_counts(sql, sql_values, session): - """ - Get table of data in the database according to the SQL - :param sql_values: SQl Values - :param sql: SQL to get project accessions - :param session: database session - :return: List of key value pairs - """ - try: - with session: - result = session.execute(sql, sql_values) - result_list = [ - {'protein_frequency': row[0], 'peptide_count': row[1]} for row in result if len(row) >= 2 - ] - except Exception as 
error: - logger.error(f"Error type: {type(error)}, Error message: {str(error)}") - finally: - logger.debug('Database session is closed.') - return result_list - - -async def get_statistics_count(sql, session): - """ - Get all the Unique accessions(project, protein) in the database according to the SQL - :param sql: SQL to get project accessions - :param session: database session - :return: List of unique project accessions - """ - values = None - try: - result = session.execute(sql) - # Fetch the values from the result - row = result.fetchone() - - statistics_counts = {'Number of Projects': row[0], - 'Number of proteins': row[1], - 'Number of peptides': row[2], - 'Number of spectra': row[3], - 'Number of species': row[4]} - print(statistics_counts) - except Exception as error: - logger.error(error) - finally: - session.close() - logger.debug('Database session is closed.') - return statistics_counts - - -async def get_projects_count(sql, session): - """ - Get project counts for health check - :param sql: SQL to get project accessions - :param session: database session - :return: count - """ - try: - result = session.execute(sql) - # Fetch the values from the result - count = result.fetchone() - except Exception as error: - logger.error(error) - finally: - session.close() - logger.debug('Database session is closed.') - return count - diff --git a/app/routes/shared.py b/app/routes/shared.py deleted file mode 100644 index 56f2a1e7..00000000 --- a/app/routes/shared.py +++ /dev/null @@ -1,91 +0,0 @@ -import os -import re -from configparser import ConfigParser - -import psycopg2 - -async def get_most_recent_upload_ids(pxid, file=None): - - """ - Get the most recent upload ids for a project/file. - - :param pxid: identifier of a project, - for ProteomeXchange projects this is the PXD****** accession - :param file: name of the file - :return: upload ids - """ - - conn = None - upload_ids = None - try: - # connect to the PostgreSQL server - # logger.info('Connecting to the PostgreSQL database...') - conn = await get_db_connection() - cur = conn.cursor() - if file: - filename_clean = re.sub(r'[^0-9a-zA-Z-]+', '-', file) - query = """SELECT id FROM upload - WHERE project_id = %s AND identification_file_name_clean = %s - ORDER BY upload_time DESC LIMIT 1;""" - # logger.debug(sql) - cur.execute(query, [pxid, filename_clean]) - - upload_ids = [cur.fetchone()[0]] - if upload_ids is None: - return None # jsonify({"error": "No data found"}), 404 - # logger.info("finished") - # close the communication with the PostgreSQL - cur.close() - else: - query = """SELECT u.id - FROM upload u - where u.upload_time = - (select max(upload_time) from upload - where project_id = u.project_id - and identification_file_name = u.identification_file_name ) - and u.project_id = %s;""" - # logger.debug(sql) - cur.execute(query, [pxid]) - upload_ids = cur.fetchall() - if upload_ids is None: - return None # jsonify({"error": "No data found"}), 404 - # logger.info("finished") - # close the communication with the PostgreSQL - cur.close() - - except (Exception, psycopg2.DatabaseError) as e: - print(e) - finally: - if conn is not None: - conn.close() - print('Database connection closed.') - - return upload_ids - - -async def get_db_connection(): - config = os.environ.get('DB_CONFIG', 'database.ini') - - # https://www.postgresqltutorial.com/postgresql-python/connect/ - async def parse_database_info(filename, section='postgresql'): - # create a parser - parser = ConfigParser() - # read config file - parser.read(filename) - - # get 
section, default to postgresql - db = {} - if parser.has_section(section): - params = parser.items(section) - for param in params: - db[param[0]] = param[1] - else: - raise Exception('Section {0} not found in the {1} file'.format(section, filename)) - - return db - - # read connection information - db_info = await parse_database_info(config) - # logger.debug('Getting DB connection...') - conn = psycopg2.connect(**db_info) - return conn diff --git a/app/routes/xiview.py b/app/routes/xiview.py deleted file mode 100644 index dd0d8f99..00000000 --- a/app/routes/xiview.py +++ /dev/null @@ -1,267 +0,0 @@ -import json -import logging -import logging.config -import struct - -import psycopg2 -from fastapi import APIRouter -from psycopg2 import sql -from psycopg2.extras import RealDictCursor - -from app.routes.shared import get_db_connection, get_most_recent_upload_ids - -xiview_data_router = APIRouter() - -logger = logging.getLogger(__name__) - - -@xiview_data_router.get('/get_xiview_data', tags=["xiVIEW"]) -async def get_xiview_data(project, file=None): - """ - Get the data for the network visualisation. - URLs have following structure: - https: // www.ebi.ac.uk / pride / archive / xiview / network.html?project=PXD020453&file=Cullin_SDA_1pcFDR.mzid - Users may provide only projects, meaning we need to have an aggregated view. - https: // www.ebi.ac.uk / pride / archive / xiview / network.html?project=PXD020453 - - :return: json with the data - """ - most_recent_upload_ids = await get_most_recent_upload_ids(project, file) - try: - data_object = await get_data_object(most_recent_upload_ids, project) - except psycopg2.DatabaseError as e: - logger.error(e) - print(e) - return {"error": "Database error"}, 500 - - return data_object - -@xiview_data_router.get('/get_peaklist', tags=["xiVIEW"]) -async def get_peaklist(id, sd_ref, upload_id): - conn = None - data = {} - error = None - try: - conn = await get_db_connection() - cur = conn.cursor() - query = "SELECT intensity, mz FROM spectrum WHERE id = %s AND spectra_data_ref = %s AND upload_id = %s" - cur.execute(query, [id, sd_ref, upload_id]) - resultset = cur.fetchall()[0] - data["intensity"] = struct.unpack('%sd' % (len(resultset[0]) // 8), resultset[0]) - data["mz"] = struct.unpack('%sd' % (len(resultset[1]) // 8), resultset[1]) - cur.close() - except (Exception, psycopg2.DatabaseError) as e: - # logger.error(error) - error = e - finally: - if conn is not None: - conn.close() - # logger.debug('Database connection closed.') - if error is not None: - raise error - return data - - -async def get_data_object(ids, pxid): - """ Connect to the PostgreSQL database server """ - conn = None - data = {} - error = None - try: - conn = await get_db_connection() - cur = conn.cursor(cursor_factory=RealDictCursor) - data["project"] = await get_pride_api_info(cur, pxid) - data["meta"] = await get_results_metadata(cur, ids) - data["matches"] = await get_matches(cur, ids) - data["peptides"] = await get_peptides(cur, data["matches"]) - data["proteins"] = await get_proteins(cur, data["peptides"]) - logger.info("finished") - cur.close() - except (Exception, psycopg2.DatabaseError) as e: - error = e - logger.error(e) - raise e - finally: - if conn is not None: - conn.close() - logger.debug('Database connection closed.') - if error is not None: - raise error - return data - -async def get_pride_api_info(cur, pxid): - """ Get the PRIDE API info for the projects """ - query = """SELECT p.id AS id, - p.id, - p.title, - p.description - FROM projectdetails p - WHERE p.project_id 
= (%s);""" - cur.execute(query, [pxid]) - return cur.fetchall() - -async def get_results_metadata(cur, ids): - """ Get the metadata for the results """ - metadata = {} - - # get Upload(s) for each id - query = """SELECT u.id AS id, - u.project_id, - u.identification_file_name, - u.provider, - u.audit_collection, - u.analysis_sample_collection, - u.bib, - u.spectra_formats, - u.contains_crosslinks, - u.upload_warnings AS warnings - FROM upload u - WHERE u.id = ANY(%s);""" - cur.execute(query, [ids]) - metadata["mzidentml_files"] = cur.fetchall() - - # get AnalysisCollection(s) for each id - query = """SELECT ac.upload_id, - ac.spectrum_identification_list_ref, - ac.spectrum_identification_protocol_ref, - ac.spectra_data_ref - FROM analysiscollection ac - WHERE ac.upload_id = ANY(%s);""" - cur.execute(query, [ids]) - metadata["analysis_collections"] = cur.fetchall() - - # get SpectrumIdentificationProtocol(s) for each id - query = """SELECT sip.id AS id, - sip.upload_id, - sip.frag_tol, - sip.frag_tol_unit, - sip.additional_search_params, - sip.analysis_software, - sip.threshold - FROM spectrumidentificationprotocol sip - WHERE sip.upload_id = ANY(%s);""" - cur.execute(query, [ids]) - metadata["spectrum_identification_protocols"] = cur.fetchall() - - # enzymes - query = """SELECT * - FROM enzyme e - WHERE e.upload_id = ANY(%s);""" - cur.execute(query, [ids]) - metadata["enzymes"] = cur.fetchall() - - # search modifications - try: - query = """SELECT * - FROM searchmodification sm - WHERE sm.upload_id = ANY(%s);""" - cur.execute(query, [ids]) - metadata["search_modifications"] = cur.fetchall() - except Exception as e: - print(e) - - return metadata - - -async def get_matches(cur, ids): - query = """SELECT si.id AS id, si.pep1_id AS pi1, si.pep2_id AS pi2, - si.scores AS sc, - cast (si.upload_id as text) AS si, - si.calc_mz AS c_mz, - si.charge_state AS pc_c, - si.exp_mz AS pc_mz, - si.spectrum_id AS sp, - si.spectra_data_ref AS sd_ref, - si.pass_threshold AS pass, - si.rank AS r, - si.sil_id AS sil - FROM spectrumidentification si - INNER JOIN modifiedpeptide mp1 ON si.pep1_id = mp1.id AND si.upload_id = mp1.upload_id - INNER JOIN modifiedpeptide mp2 ON si.pep2_id = mp2.id AND si.upload_id = mp2.upload_id - WHERE si.upload_id = ANY(%s) - AND si.pass_threshold = TRUE - AND mp1.link_site1 > 0 - AND mp2.link_site1 > 0;""" - # bit weird above works when link_site1 is a text column - cur.execute(query, [ids]) - return cur.fetchall() - - -async def get_peptides(cur, match_rows): - search_peptide_ids = {} - for match_row in match_rows: - if match_row['si'] in search_peptide_ids: - peptide_ids = search_peptide_ids[match_row['si']] - else: - peptide_ids = set() - search_peptide_ids[match_row['si']] = peptide_ids - peptide_ids.add(match_row['pi1']) - if match_row['pi2'] is not None: - peptide_ids.add(match_row['pi2']) - - subclauses = [] - for k, v in search_peptide_ids.items(): - pep_id_literals = [] - for pep_id in v: - pep_id_literals.append(sql.Literal(pep_id)) - joined_pep_ids = sql.SQL(',').join(pep_id_literals) - subclause = sql.SQL("(mp.upload_id = {} AND id IN ({}))").format( - sql.Literal(k), - joined_pep_ids - ) - subclauses.append(subclause) - peptide_clause = sql.SQL(" OR ").join(subclauses) - - query = sql.SQL("""SELECT mp.id, cast(mp.upload_id as text) AS u_id, - mp.base_sequence AS base_seq, - array_agg(pp.dbsequence_ref) AS prt, - array_agg(pp.pep_start) AS pos, - array_agg(pp.is_decoy) AS is_decoy, - mp.link_site1 AS "linkSite", - mp.mod_accessions as mod_accs, - mp.mod_positions as 
mod_pos, - mp.mod_monoiso_mass_deltas as mod_masses, - mp.crosslinker_modmass as cl_modmass - FROM modifiedpeptide AS mp - JOIN peptideevidence AS pp - ON mp.id = pp.peptide_ref AND mp.upload_id = pp.upload_id - WHERE {} - GROUP BY mp.id, mp.upload_id, mp.base_sequence;""").format( - peptide_clause - ) - # logger.debug(query.as_string(cur)) - cur.execute(query) - return cur.fetchall() - - -async def get_proteins(cur, peptide_rows): - search_protein_ids = {} - for peptide_row in peptide_rows: - if peptide_row['u_id'] in search_protein_ids: - protein_ids = search_protein_ids[peptide_row['u_id']] - else: - protein_ids = set() - search_protein_ids[peptide_row['u_id']] = protein_ids - for prot in peptide_row['prt']: - protein_ids.add(prot) - - subclauses = [] - for k, v in search_protein_ids.items(): - literals = [] - for prot_id in v: - literals.append(sql.Literal(prot_id)) - joined_literals = sql.SQL(",").join(literals) - subclause = sql.SQL("(upload_id = {} AND id IN ({}))").format( - sql.Literal(k), - joined_literals - ) - subclauses.append(subclause) - - protein_clause = sql.SQL(" OR ").join(subclauses) - query = sql.SQL("""SELECT id, name, accession, sequence, - cast(upload_id as text) AS search_id, description FROM dbsequence WHERE ({});""").format( - protein_clause - ) - # logger.debug(query.as_string(cur)) - cur.execute(query) - return cur.fetchall() diff --git a/app/config/__init__.py b/config/__init__.py similarity index 100% rename from app/config/__init__.py rename to config/__init__.py diff --git a/config/config_parser.py b/config/config_parser.py new file mode 100644 index 00000000..ef98564e --- /dev/null +++ b/config/config_parser.py @@ -0,0 +1,55 @@ +from configparser import ConfigParser +import os + + +def parse_config(filename, section='postgresql'): + # create a parser + parser = ConfigParser() + # read config file + parser.read(filename) + + # get section, default to postgresql + configs = {} + if parser.has_section(section): + params = parser.items(section) + for param in params: + configs[param[0]] = param[1] + else: + raise Exception('Section {0} not found in the {1} file'.format(section, filename)) + return configs + + +def get_conn_str(): + """ + Get database related configurations + """ + script_dir = os.path.dirname(os.path.abspath(__file__)) + config = os.environ.get('DB_CONFIG', os.path.join(script_dir, "database.ini")) + db_info = parse_config(config) + hostname = os.environ.get('DB_HOST') or db_info.get("host") + database = os.environ.get('DB_DATABASE_NAME') or db_info.get("database") + username = os.environ.get('DB_USER') or db_info.get("user") + password = os.environ.get('DB_PASSWORD') or db_info.get("password") + port = os.environ.get('DB_PORT') or db_info.get("port") + conn_str = f"postgresql://{username}:{password}@{hostname}:{port}/{database}" + return conn_str + + +# def security_API_key(): +# config = os.environ.get('DB_CONFIG', '../database.ini') +# security_info = parse_config(config, 'security') +# apikey = security_info.get("apikey") +# return apikey + + +def get_api_configs(): + """ + Get API related configurations + """ + script_dir = os.path.dirname(os.path.abspath(__file__)) + config = os.environ.get('DB_CONFIG', os.path.join(script_dir, "database.ini")) + api_configs = parse_config(config, "api") + config= {"base_url": os.environ.get('BASE_URL') or api_configs.get("base_url"), + "api_key": os.environ.get('API_KEY') or api_configs.get("api_key"), + "api_key_value": os.environ.get('API_KEY_VALUE') or api_configs.get("api_key_value")} + return 
config diff --git a/app/config/database.py b/config/database.py similarity index 87% rename from app/config/database.py rename to config/database.py index 5a913a1d..4ab0f2ff 100644 --- a/app/config/database.py +++ b/config/database.py @@ -1,6 +1,6 @@ from sqlalchemy import create_engine from sqlalchemy.orm import sessionmaker -from db_config_parser import get_conn_str +from config_parser import get_conn_str conn_str = get_conn_str() diff --git a/index.py b/config/index.py similarity index 78% rename from index.py rename to config/index.py index 19441c0d..b9929b74 100644 --- a/index.py +++ b/config/index.py @@ -1,4 +1,4 @@ -from app.config.database import SessionLocal +from config.database import SessionLocal # Helper function to get database session diff --git a/logging.ini b/config/logging.ini similarity index 89% rename from logging.ini rename to config/logging.ini index 6da719c9..aebca283 100644 --- a/logging.ini +++ b/config/logging.ini @@ -21,7 +21,7 @@ args=(sys.stdout,) class=handlers.TimedRotatingFileHandler level=DEBUG formatter=fileFormatter -args=('logs/xi_mzidentml_converter.log', 'midnight', 1, 30, 'utf-8') +args=('../logs/xi_mzidentml_converter.log', 'midnight', 1, 30, 'utf-8') [formatter_consoleFormatter] format=%(asctime)s - %(name)s - %(levelname)s - %(message)s diff --git a/db_config_parser.py b/db_config_parser.py deleted file mode 100644 index 59e152d3..00000000 --- a/db_config_parser.py +++ /dev/null @@ -1,38 +0,0 @@ -from configparser import ConfigParser -import os - - -def parse_info(filename, section='postgresql'): - # create a parser - parser = ConfigParser() - # read config file - parser.read(filename) - - # get section, default to postgresql - section_info = {} - if parser.has_section(section): - params = parser.items(section) - for param in params: - section_info[param[0]] = param[1] - else: - raise Exception('Section {0} not found in the {1} file'.format(section, filename)) - return section_info - - -def get_conn_str(): - config = os.environ.get('DB_CONFIG', 'database.ini') - db_info = parse_info(config, 'postgresql') - hostname = db_info.get("host") - database = db_info.get("database") - username = db_info.get("user") - password = db_info.get("password") - port = db_info.get("port") - conn_str = f"postgresql://{username}:{password}@{hostname}:{port}/{database}" - return conn_str - - -def security_API_key(): - config = os.environ.get('DB_CONFIG', 'database.ini') - security_info = parse_info(config, 'security') - apikey = security_info.get("apikey") - return apikey diff --git a/dbs/saved/.gitkeep b/dbs/saved/.gitkeep deleted file mode 100755 index 8d1c8b69..00000000 --- a/dbs/saved/.gitkeep +++ /dev/null @@ -1 +0,0 @@ - diff --git a/dbs/tmp/.gitkeep b/dbs/tmp/.gitkeep deleted file mode 100755 index 8b137891..00000000 --- a/dbs/tmp/.gitkeep +++ /dev/null @@ -1 +0,0 @@ - diff --git a/default.database.ini b/default.database.ini index a53a97cb..19b7f3eb 100644 --- a/default.database.ini +++ b/default.database.ini @@ -6,4 +6,5 @@ password=$DB_PASSWORD port=$DB_PORT [security] -apikey=$API_KEY \ No newline at end of file +apikey=$API_KEY +base_url=$BASE_URL \ No newline at end of file diff --git a/environment.yml b/environment.yml new file mode 100644 index 00000000..98b4215b --- /dev/null +++ b/environment.yml @@ -0,0 +1,26 @@ +# conda env create -f environment.yml +name: xiview-mzidentml-converter +channels: + - defaults +dependencies: + - python=3.10 + - pip + - pip: + - fastapi>=0.68.0,<0.69.0 + - uvicorn<0.16.0,>=0.15.0 + - lxml>=4.9.1 + - numpy>=1.14.3 + - 
pandas>=0.21.0 + - pymzml>=0.7.8 + - pyteomics>=3.4.2 + - requests>=2.20.1 + - urllib3>=1.24.2 + - pytest + - psycopg2-binary + - sqlalchemy==2.0.21 + - sqlalchemy-utils + - obonet + - python-multipart + - python-jose + - passlib + - jose diff --git a/main.py b/main.py deleted file mode 100644 index 01ac7ec1..00000000 --- a/main.py +++ /dev/null @@ -1,11 +0,0 @@ -import uvicorn - -from app.api import app - -if __name__ == "__main__": - uvicorn.run(app, - host="0.0.0.0", - port=3000, - log_level="info", # Set the desired log level ("debug", "info", "warning", etc.) - log_config="logging.ini" # Specify the path to your logging configuration file - ) diff --git a/app/models/__init__.py b/models/__init__.py similarity index 100% rename from app/models/__init__.py rename to models/__init__.py diff --git a/app/models/analysiscollection.py b/models/analysiscollection.py similarity index 96% rename from app/models/analysiscollection.py rename to models/analysiscollection.py index 29d9511d..f22c076b 100644 --- a/app/models/analysiscollection.py +++ b/models/analysiscollection.py @@ -1,6 +1,6 @@ from sqlalchemy.orm import Mapped, mapped_column from sqlalchemy import ForeignKey, Text, ForeignKeyConstraint, Integer -from app.models.base import Base +from models.base import Base class AnalysisCollection(Base): diff --git a/app/models/base.py b/models/base.py similarity index 100% rename from app/models/base.py rename to models/base.py diff --git a/app/models/dbsequence.py b/models/dbsequence.py similarity index 94% rename from app/models/dbsequence.py rename to models/dbsequence.py index ebd4f095..377a0d0d 100644 --- a/app/models/dbsequence.py +++ b/models/dbsequence.py @@ -1,6 +1,6 @@ from sqlalchemy.orm import Mapped, mapped_column from sqlalchemy import ForeignKey, Text, Integer -from app.models.base import Base +from models.base import Base class DBSequence(Base): diff --git a/app/models/enzyme.py b/models/enzyme.py similarity index 89% rename from app/models/enzyme.py rename to models/enzyme.py index a194df7c..44402806 100644 --- a/app/models/enzyme.py +++ b/models/enzyme.py @@ -1,6 +1,6 @@ from sqlalchemy.orm import Mapped, mapped_column -from sqlalchemy import ForeignKey, Text, Integer, BOOLEAN, ForeignKeyConstraint, Integer -from app.models.base import Base +from sqlalchemy import ForeignKey, Text, BOOLEAN, ForeignKeyConstraint, Integer +from models.base import Base class Enzyme(Base): diff --git a/app/models/index.py b/models/index.py similarity index 100% rename from app/models/index.py rename to models/index.py diff --git a/app/models/modifiedpeptide.py b/models/modifiedpeptide.py similarity index 97% rename from app/models/modifiedpeptide.py rename to models/modifiedpeptide.py index e3b90356..22972087 100644 --- a/app/models/modifiedpeptide.py +++ b/models/modifiedpeptide.py @@ -1,6 +1,6 @@ from sqlalchemy.orm import Mapped, mapped_column from sqlalchemy import ForeignKey, Text, Integer, JSON, FLOAT -from app.models.base import Base +from models.base import Base from typing import Optional, Any class ModifiedPeptide(Base): diff --git a/app/models/peptideevidence.py b/models/peptideevidence.py similarity index 96% rename from app/models/peptideevidence.py rename to models/peptideevidence.py index fb09082a..2e937433 100644 --- a/app/models/peptideevidence.py +++ b/models/peptideevidence.py @@ -1,6 +1,6 @@ from sqlalchemy.orm import Mapped, mapped_column from sqlalchemy import ForeignKey, Text, Integer, BOOLEAN, ForeignKeyConstraint -from app.models.base import Base +from models.base import 
Base class PeptideEvidence(Base): diff --git a/app/models/projectdetail.py b/models/projectdetail.py similarity index 92% rename from app/models/projectdetail.py rename to models/projectdetail.py index 300ff4f5..10b456a9 100644 --- a/app/models/projectdetail.py +++ b/models/projectdetail.py @@ -1,6 +1,6 @@ from sqlalchemy.orm import Mapped, mapped_column, relationship -from sqlalchemy import ForeignKey, Text, Integer -from app.models.base import Base +from sqlalchemy import Text, Integer +from models.base import Base class ProjectDetail(Base): diff --git a/app/models/projectsubdetail.py b/models/projectsubdetail.py similarity index 97% rename from app/models/projectsubdetail.py rename to models/projectsubdetail.py index f6dc8209..c20f34b9 100644 --- a/app/models/projectsubdetail.py +++ b/models/projectsubdetail.py @@ -1,6 +1,6 @@ from sqlalchemy.orm import Mapped, mapped_column, relationship from sqlalchemy import ForeignKey, Text, Integer, Boolean -from app.models.base import Base +from models.base import Base class ProjectSubDetail(Base): diff --git a/app/models/searchmodification.py b/models/searchmodification.py similarity index 97% rename from app/models/searchmodification.py rename to models/searchmodification.py index 8e61ef1f..3f5c812a 100644 --- a/app/models/searchmodification.py +++ b/models/searchmodification.py @@ -1,6 +1,6 @@ from sqlalchemy.orm import Mapped, mapped_column from sqlalchemy import ForeignKey, Text, BOOLEAN, BIGINT, FLOAT, JSON, ForeignKeyConstraint, Integer -from app.models.base import Base +from models.base import Base from typing import Any diff --git a/app/models/spectrum.py b/models/spectrum.py similarity index 96% rename from app/models/spectrum.py rename to models/spectrum.py index 3aafb97f..27fdc9fd 100644 --- a/app/models/spectrum.py +++ b/models/spectrum.py @@ -1,6 +1,6 @@ from sqlalchemy.orm import Mapped, mapped_column from sqlalchemy import ForeignKey, Text, LargeBinary, SMALLINT, FLOAT, Integer -from app.models.base import Base +from models.base import Base class Spectrum(Base): diff --git a/app/models/spectrumidentification.py b/models/spectrumidentification.py similarity index 98% rename from app/models/spectrumidentification.py rename to models/spectrumidentification.py index 6b38c494..fec350f1 100644 --- a/app/models/spectrumidentification.py +++ b/models/spectrumidentification.py @@ -1,6 +1,6 @@ from sqlalchemy.orm import Mapped, mapped_column from sqlalchemy import ForeignKey, Text, FLOAT, JSON, BOOLEAN, Integer, ForeignKeyConstraint -from app.models.base import Base +from models.base import Base from typing import Optional, Any diff --git a/app/models/spectrumidentificationprotocol.py b/models/spectrumidentificationprotocol.py similarity index 96% rename from app/models/spectrumidentificationprotocol.py rename to models/spectrumidentificationprotocol.py index 270d5505..4819fb9e 100644 --- a/app/models/spectrumidentificationprotocol.py +++ b/models/spectrumidentificationprotocol.py @@ -1,6 +1,6 @@ from sqlalchemy.orm import Mapped, mapped_column from sqlalchemy import ForeignKey, Text, JSON, Integer, Float -from app.models.base import Base +from models.base import Base from typing import Optional, Any diff --git a/app/models/upload.py b/models/upload.py similarity index 88% rename from app/models/upload.py rename to models/upload.py index b0b5327b..433be294 100644 --- a/app/models/upload.py +++ b/models/upload.py @@ -1,6 +1,6 @@ -from sqlalchemy.orm import Mapped, mapped_column, relationship -from sqlalchemy import ForeignKey, Text, JSON, 
BOOLEAN, TIMESTAMP, func, Integer -from app.models.base import Base +from sqlalchemy.orm import Mapped, mapped_column +from sqlalchemy import Text, JSON, BOOLEAN, TIMESTAMP, func, Integer +from models.base import Base from typing import Optional, Any import datetime diff --git a/parser/MzIdParser.py b/parser/MzIdParser.py index a43fd8a0..d47739d6 100644 --- a/parser/MzIdParser.py +++ b/parser/MzIdParser.py @@ -1,9 +1,11 @@ +import base64 import gzip import json import ntpath import os import re import struct +import traceback import zipfile from time import time @@ -13,6 +15,8 @@ from sqlalchemy import Table from sqlalchemy.exc import SQLAlchemyError +from parser import writer +from parser.api_writer import APIWriter from parser.peaklistReader.PeakListWrapper import PeakListWrapper @@ -589,6 +593,10 @@ def main_loop(self): mz_blob = struct.pack(f'{len(mz_blob)}d', *mz_blob) intensity_blob = spectrum.int_values.tolist() intensity_blob = struct.pack(f'{len(intensity_blob)}d', *intensity_blob) + # Encode binary data using base64 to enable transmitting in API call and then decode in API + if isinstance(self.writer, APIWriter): + mz_blob = base64.b64encode(mz_blob).decode('utf-8') + intensity_blob = base64.b64encode(intensity_blob).decode('utf-8') spectra.append({ 'id': sid_result["spectrumID"], @@ -672,7 +680,8 @@ def main_loop(self): self.writer.write_data('spectrumidentification', spectrum_identifications) spectrum_identifications = [] except Exception as e: - raise e + print(f"Caught an exception while writing data: {e}") + traceback.print_exc() # end main loop self.logger.info('main loop - done Time: {} sec'.format( @@ -738,7 +747,7 @@ def upload_info(self): bib_refs.append(bib) self.mzid_reader.reset() - self.writer.write_mzid_info(analysis_software_list, spectra_formats, provider, audits, samples, bib_refs) + self.writer.write_mzid_info(analysis_software_list, spectra_formats, provider, audits, samples, bib_refs, self.writer.upload_id) self.logger.info('getting upload info - done Time: {} sec'.format( round(time() - upload_info_start_time, 2))) @@ -748,27 +757,26 @@ def fill_in_missing_scores(self): def write_new_upload(self): """Write new upload.""" - filename = os.path.basename(self.mzid_path) - upload_data = { - 'identification_file_name': filename, - 'project_id': self.writer.pxid, - 'identification_file_name_clean': re.sub(r'[^0-9a-zA-Z-]+', '-', filename) - } - # self.writer.write_data('Upload', upload_data) try: - table = Table('upload', self.writer.meta, autoload_with=self.writer.engine, quote=False) - with self.writer.engine.connect() as conn: - statement = table.insert().values(upload_data).returning(table.columns[0]) # RETURNING id AS upload_id - result = conn.execute(statement) - conn.commit() - self.writer.upload_id = result.fetchall()[0][0] - conn.close() + filename = os.path.basename(self.mzid_path) + upload_data = { + 'identification_file_name': filename, + 'project_id': self.writer.pxid, + 'identification_file_name_clean': re.sub(r'[^0-9a-zA-Z-]+', '-', filename) + } + table = 'upload' + + response = self.writer.write_new_upload(table, upload_data) + if response: + self.writer.upload_id =int(response) + else: + raise Exception("Response is not available to create a upload ID") except SQLAlchemyError as e: print(f"Error during database insert: {e}") def write_other_info(self): """Write remaining information into Upload table.""" - self.writer.write_other_info(self.contains_crosslinks, list(self.warnings)) + self.writer.write_other_info(self.contains_crosslinks, 
list(self.warnings), self.writer.upload_id) def get_cv_params(self, element, super_cls_accession=None): """ diff --git a/parser/api_writer.py b/parser/api_writer.py new file mode 100644 index 00000000..75ef70b0 --- /dev/null +++ b/parser/api_writer.py @@ -0,0 +1,160 @@ +import traceback +import requests +import json + +from config.config_parser import get_api_configs +from parser.writer import Writer + + +class APIWriter(Writer): + """Class for writing results to a relational database.""" + + def __init__(self, connection_str, user_id=None, upload_id=None, pxid=None): + self.pxid = pxid + self.upload_id = upload_id + configs = get_api_configs() + self.base_url = configs['base_url'] + self.api_key = configs['api_key'] + self.api_key_value = configs['api_key_value'] + + def write_data(self, table, data): + response = None + try: + API_ENDPOINT = self.base_url+ "/write_data" + API_KEY_VALUE = self.api_key_value + API_KEY = self.api_key + headers = {'Content-Type': 'application/json', API_KEY: API_KEY_VALUE} + payload = { + "table": table, + "data": data, + } + # Calculate the size of the payload + payload_size = len(json.dumps(payload)) + print("Payload Size:", payload_size) # Print the payload size + response = requests.post(url=API_ENDPOINT, headers=headers, json=payload) + response.raise_for_status() + + # Check the response status code and handle it as needed + if response.status_code == 200: + print("Request successful:" + API_ENDPOINT) + else: + print(f"Unexpected status code: {response.status_code}") + except Exception as e: + print(f"Caught an exception: {e}") + # print(payload) + traceback.print_exc() + if response is not None: + return response.json() + else: + return None + + + def write_new_upload(self, table, data): + response = None + + try: + API_ENDPOINT = self.base_url + "/write_new_upload" + API_KEY_VALUE = self.api_key_value + API_KEY = self.api_key + headers = {'Content-Type': 'application/json', API_KEY: API_KEY_VALUE} + payload = { + "table": table, + "data": data, + } + # Calculate the size of the payload + payload_size = len(json.dumps(payload)) + print("write_new_upload Payload Size:", payload_size) # Print the payload size + response = requests.post(url=API_ENDPOINT, headers=headers, json=payload) + response.raise_for_status() + + # Check the response status code and handle it as needed + if response.status_code == 200: + print("Request successful") + else: + print(f"Unexpected status code: {response.status_code}") + print(response.json()) + except Exception as e: + print(f"Caught an exception: {e}") + traceback.print_exc() + if response is not None: + return response.json() + else: + return None + + def write_mzid_info(self, analysis_software_list, spectra_formats, + provider, audits, samples, bib, upload_id): + response = None + try: + API_ENDPOINT = self.base_url + "/write_mzid_info?upload_id="+str(upload_id) + API_KEY_VALUE = self.api_key_value + API_KEY = self.api_key + headers = {'Content-Type': 'application/json', API_KEY: API_KEY_VALUE} + payload = { + "analysis_software_list": analysis_software_list, + "spectra_formats": spectra_formats, + "provider": provider, + "audits": audits, + "samples": samples, + "bib": bib, + } + # Calculate the size of the payload + payload_size = len(json.dumps(payload)) + print("write_mzid_info Payload Size:", payload_size) # Print the payload size + response = requests.post(url=API_ENDPOINT, headers=headers, json=payload) + response.raise_for_status() + result = response.json() + + # Check the response status code and handle it 
as needed + if response.status_code == 200: + print("Request successful") + print(result) + else: + print(f"Unexpected status code: {response.status_code}") + + print(result) + except Exception as e: + print(f"Caught an exception: {e}") + traceback.print_exc() + if response is not None: + return response.json() + else: + return None + + + def write_other_info(self, contains_crosslinks, upload_warnings, upload_id): + response = None + try: + #todo: use urljoin + API_ENDPOINT = self.base_url + "/write_other_info?upload_id="+str(upload_id) + API_KEY_VALUE = self.api_key_value + API_KEY = self.api_key + headers = {'Content-Type': 'application/json', API_KEY: API_KEY_VALUE} + payload = { + "contains_crosslinks": contains_crosslinks, + "upload_warnings": upload_warnings, + } + response = requests.post(url=API_ENDPOINT, headers=headers, json=payload) + response.raise_for_status() + result = response.json() + + # Check the response status code and handle it as needed + if response.status_code == 200: + print("Request successful") + print(result) + else: + print(f"Unexpected status code: {response.status_code}") + + print(result) + except Exception as e: + print(f"Caught an exception: {e}") + traceback.print_exc() + if response is not None: + return response.json() + else: + return None + + def fill_in_missing_scores(self): + """ + ToDo: this needs to be adapted to sqlalchemy from old SQLite version + """ + pass \ No newline at end of file diff --git a/create_db_schema.py b/parser/database/create_db_schema.py similarity index 88% rename from create_db_schema.py rename to parser/database/create_db_schema.py index 8e3c155b..9d7f95dd 100644 --- a/create_db_schema.py +++ b/parser/database/create_db_schema.py @@ -3,8 +3,7 @@ from sqlalchemy import create_engine from sqlalchemy_utils import database_exists, drop_database, create_database -from app.models.base import Base -from app.models import * +from models.base import Base def create_db(connection_str): @@ -27,7 +26,7 @@ def create_schema(connection_str): if __name__ == "__main__": try: - from db_config_parser import get_conn_str + from config.config_parser import get_conn_str except ModuleNotFoundError: raise ModuleNotFoundError( 'Database credentials missing! ' diff --git a/parser/database_writer.py b/parser/database_writer.py new file mode 100644 index 00000000..b3c607c8 --- /dev/null +++ b/parser/database_writer.py @@ -0,0 +1,99 @@ +from sqlalchemy import create_engine, MetaData +from sqlalchemy import Table +from parser.database.create_db_schema import create_schema +from sqlalchemy_utils import database_exists + + +class DatabaseWriter: + """Class for writing results to a relational database.""" + + def __init__(self, connection_str, user_id=None, upload_id=None, pxid=None): + """ + Initialises the database connection and the writer in general. + + :param connection_str: database connection string + :param user_id: UUID of the UserAccount (postgresql specific) + """ + # Connection setup. + # The 'engine' in SQLAlchemy is a Factory and connection pool to the database. + # It has lazy initialisation. + self.engine = create_engine(connection_str) + self.meta = MetaData() + self.pxid = pxid + self.upload_id = upload_id + # Create table schema if necessary (SQLite) - not working for postgresql - why? + if not database_exists(self.engine.url): + create_schema(self.engine.url) + + def write_data(self, table, data): + """ + Insert data into table. + + :param table: (str) Table name + :param data: (list dict) data to insert. 
+ """ + table = Table(table, self.meta, autoload_with=self.engine) + with self.engine.connect() as conn: + statement = table.insert().values(data) + conn.execute(statement) + conn.commit() + conn.close() + + def write_new_upload(self, table, data): + table = Table(table, self.meta, autoload_with=self.engine, quote=False) + with self.engine.connect() as conn: + statement = table.insert().values(data).returning(table.columns[0]) # RETURNING id AS upload_id + result = conn.execute(statement) + conn.commit() + conn.close() + return result.fetchall()[0][0] + + def write_mzid_info(self, analysis_software_list, spectra_formats, + provider, audits, samples, bib, upload_id): + """ + Update Upload row with mzid info. + + ToDo: have this explicitly or create update func? + :param spectra_formats: + :param provider: + :param audits: + :param samples: + :param bib: + :return: + """ + upload = Table("upload", self.meta, autoload_with=self.engine, quote=False) + stmt = upload.update().where(upload.c.id == str(upload_id)).values( + analysis_software_list=analysis_software_list, + spectra_formats=spectra_formats, + provider=provider, + audit_collection=audits, + analysis_sample_collection=samples, + bib=bib + ) + with self.engine.connect() as conn: + conn.execute(stmt) + conn.commit() + + def write_other_info(self, contains_crosslinks, upload_warnings, upload_id): + """ + Update Upload row with remaining info. + + ToDo: have this explicitly or create update func? + :param contains_crosslinks: + :param upload_warnings: + :return: + """ + upload = Table("upload", self.meta, autoload_with=self.engine, quote=False) + with self.engine.connect() as conn: + stmt = upload.update().where(upload.c.id == str(upload_id)).values( + contains_crosslinks=contains_crosslinks, + upload_warnings=upload_warnings, + ) + conn.execute(stmt) + conn.commit() + + def fill_in_missing_scores(self): + """ + ToDo: this needs to be adapted to sqlalchemy from old SQLite version + """ + pass \ No newline at end of file diff --git a/process_dataset.py b/parser/process_dataset.py similarity index 76% rename from process_dataset.py rename to parser/process_dataset.py index 0c9244b7..df91ead5 100644 --- a/process_dataset.py +++ b/parser/process_dataset.py @@ -13,40 +13,74 @@ from urllib.parse import urlparse from parser.MzIdParser import MzIdParser -from parser.writer import Writer -from db_config_parser import get_conn_str import logging.config +from parser.api_writer import APIWriter +from config.config_parser import get_conn_str +from parser.database_writer import DatabaseWriter -logging.config.fileConfig("logging.ini") +logging_config_file = os.path.join(os.path.dirname(__file__), '../config/logging.ini') +logging.config.fileConfig(logging_config_file) logger = logging.getLogger(__name__) -def main(args): - if args.temp: - temp_dir = os.path.expanduser(args.temp) - else: - temp_dir = os.path.expanduser('~/mzId_convertor_temp') - - if args.pxid: - px_accessions = args.pxid - for px_accession in px_accessions: - # convert_pxd_accession(px_accession, temp_dir, args.dontdelete) - convert_pxd_accession_from_pride(px_accession, temp_dir, args.dontdelete) - elif args.ftp: - ftp_url = args.ftp - if args.identifier: - project_identifier = args.identifier +def main(): + parser = argparse.ArgumentParser( + description='Process mzIdentML files in a dataset and load them into a relational database.') + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument('-p', '--pxid', nargs='+', + help='proteomeXchange accession, should be 
of the form PXDnnnnnn or numbers only', ) + group.add_argument('-f', '--ftp', + help='process files from specified ftp location, e.g. ftp://ftp.jpostdb.org/JPST001914/') + group.add_argument('-d', '--dir', + help='process files in specified local directory, e.g. /home/user/data/JPST001914') + parser.add_argument('-i', '--identifier', + help='identifier to use for dataset (if providing ' + 'proteome exchange accession these are always used instead and this arg is ignored)') + parser.add_argument('--dontdelete', action='store_true', help='Do not delete downloaded data after processing') + parser.add_argument('-t', '--temp', help='Temp folder to download data files into') + parser.add_argument('-n', '--nopeaklist', + help='No peak list files available, only works in combination with --dir arg', + action='store_true') + parser.add_argument('-w', '--writer', help='Save data to database (-w db) or API (-w api)') + args = parser.parse_args() + try: + logger.info("process_dataset.py is running!") + print("process_dataset.py is running!") + if args.temp: + temp_dir = os.path.expanduser(args.temp) else: - parsed_url = urlparse(ftp_url) - project_identifier = parsed_url.path.rsplit("/", 1)[-1] - convert_from_ftp(ftp_url, temp_dir, project_identifier, args.dontdelete) - else: - local_dir = args.dir - if args.identifier: - project_identifier = args.identifier + temp_dir = os.path.expanduser('~/mzId_convertor_temp') + + if args.writer: + writer_method = args.writer + if not (writer_method.lower() == 'api' or writer_method.lower() == 'db'): + raise ValueError('Writer method not supported! Please use "api" or "db"') + + if args.pxid: + px_accessions = args.pxid + for px_accession in px_accessions: + # convert_pxd_accession(px_accession, temp_dir, args.dontdelete) + convert_pxd_accession_from_pride(px_accession, temp_dir, writer_method, args.dontdelete) + elif args.ftp: + ftp_url = args.ftp + if args.identifier: + project_identifier = args.identifier + else: + parsed_url = urlparse(ftp_url) + project_identifier = parsed_url.path.rsplit("/", 1)[-1] + convert_from_ftp(ftp_url, temp_dir, project_identifier, writer_method, args.dontdelete) else: - project_identifier = local_dir.rsplit("/", 1)[-1] - convert_dir(local_dir, project_identifier, nopeaklist=args.nopeaklist) + local_dir = args.dir + if args.identifier: + project_identifier = args.identifier + else: + project_identifier = local_dir.rsplit("/", 1)[-1] + convert_dir(local_dir, project_identifier, writer_method, nopeaklist=args.nopeaklist) + sys.exit(0) + except Exception as ex: + logger.error(ex) + traceback.print_exc() + sys.exit(1) def convert_pxd_accession(px_accession, temp_dir, dont_delete=False): @@ -71,7 +105,7 @@ def convert_pxd_accession(px_accession, temp_dir, dont_delete=False): raise Exception('Error: ProteomeXchange returned status code ' + str(px_response.status_code)) -def convert_pxd_accession_from_pride(px_accession, temp_dir, dont_delete=False): +def convert_pxd_accession_from_pride(px_accession, temp_dir, writer_method, dont_delete=False): # get ftp location from PRIDE API px_url = 'https://www.ebi.ac.uk/pride/ws/archive/v2/files/byProject?accession=' + px_accession logger.info('GET request to PRIDE API: ' + px_url) @@ -101,14 +135,14 @@ def convert_pxd_accession_from_pride(px_accession, temp_dir, dont_delete=False): logger.info('PRIDE FTP path : ' + parent_folder) break; - convert_from_ftp(ftp_url, temp_dir, px_accession, dont_delete) + convert_from_ftp(ftp_url, temp_dir, px_accession, writer_method, 
dont_delete) if not ftp_url: raise Exception('Error: Public File location not found in PRIDE API response') else: raise Exception('Error: PRIDE API returned status code ' + str(pride_response.status_code)) -def convert_from_ftp(ftp_url, temp_dir, project_identifier, dont_delete): +def convert_from_ftp(ftp_url, temp_dir, project_identifier, writer_method, dont_delete): if not ftp_url.startswith('ftp://'): raise Exception('Error: FTP location must start with ftp://') if not os.path.isdir(temp_dir): @@ -147,7 +181,7 @@ def convert_from_ftp(ftp_url, temp_dir, project_identifier, dont_delete): # error_msg = "%s: %s" % (f, e.args[0]) # self.logger.error(error_msg) raise e - convert_dir(path, project_identifier) + convert_dir(path, project_identifier, writer_method) if not dont_delete: # remove downloaded files try: @@ -191,7 +225,7 @@ def get_ftp_file_list(ftp_ip, ftp_dir): return filelist -def convert_dir(local_dir, project_identifier, nopeaklist=False): +def convert_dir(local_dir, project_identifier,writer_method, nopeaklist=False): # logging.basicConfig(level=logging.DEBUG, # format='%(asctime)s %(levelname)s %(name)s %(message)s') # logger = logging.getLogger(__name__) @@ -201,7 +235,10 @@ def convert_dir(local_dir, project_identifier, nopeaklist=False): if file.endswith(".mzid") or file.endswith(".mzid.gz"): logger.info("Processing " + file) conn_str = get_conn_str() - writer = Writer(conn_str, pxid=project_identifier) + if writer_method.lower() == 'api': + writer = APIWriter(conn_str, pxid=project_identifier) + elif writer_method.lower() == 'db': + writer = DatabaseWriter(conn_str, pxid=project_identifier) id_parser = MzIdParser(os.path.join(local_dir, file), local_dir, peaklist_dir, writer, logger) try: id_parser.parse() @@ -216,28 +253,4 @@ def convert_dir(local_dir, project_identifier, nopeaklist=False): if __name__ == "__main__": - parser = argparse.ArgumentParser( - description='Process mzIdentML files in a dataset and load them into a relational database.') - group = parser.add_mutually_exclusive_group(required=True) - group.add_argument('-p', '--pxid', nargs='+', - help='proteomeXchange accession, should be of the form PXDnnnnnn or numbers only', ) - group.add_argument('-f', '--ftp', - help='process files from specified ftp location, e.g. ftp://ftp.jpostdb.org/JPST001914/') - group.add_argument('-d', '--dir', - help='process files in specified local directory, e.g. 
/home/user/data/JPST001914') - parser.add_argument('-i', '--identifier', - help='identifier to use for dataset (if providing ' - 'proteome exchange accession these are always used instead and this arg is ignored)') - parser.add_argument('--dontdelete', action='store_true', help='Do not delete downloaded data after processing') - parser.add_argument('-t', '--temp', action='store_true', help='Temp folder to download data files into') - parser.add_argument('-n', '--nopeaklist', - help='No peak list files available, only works in comination with --dir arg', - action='store_true') - try: - logger.info("process_dataset.py is running!") - main(parser.parse_args()) - sys.exit(0) - except Exception as ex: - logger.error(ex) - traceback.print_stack(ex) - sys.exit(1) + main() diff --git a/parser/writer.py b/parser/writer.py index 740dbb9e..46e09851 100644 --- a/parser/writer.py +++ b/parser/writer.py @@ -1,124 +1,25 @@ -from sqlalchemy import create_engine, MetaData -from sqlalchemy import Table -from create_db_schema import create_schema -from sqlalchemy_utils import database_exists +from abc import ABC, abstractmethod - -class Writer: - """Class for writing results to a relational database.""" +## Strategy interface +class Writer(ABC): def __init__(self, connection_str, user_id=None, upload_id=None, pxid=None): - """ - Initialises the database connection and the writer in general. - - :param connection_str: database connection string - :param user_id: UUID of the UserAccount (postgresql specific) - """ - # Connection setup. - # The 'engine' in SQLAlchemy is a Factory and connection pool to the database. - # It has lazy initialisation. - self.engine = create_engine(connection_str) - self.meta = MetaData() self.pxid = pxid - # Create table schema if necessary (SQLite) - not working for postgresql - why? - if not database_exists(self.engine.url): - create_schema(self.engine.url) + self.upload_id = upload_id + @abstractmethod def write_data(self, table, data): - """ - Insert data into table. + pass - :param table: (str) Table name - :param data: (list dict) data to insert. - """ - table = Table(table, self.meta, autoload_with=self.engine) - with self.engine.connect() as conn: - statement = table.insert().values(data) - conn.execute(statement) - conn.commit() - conn.close() + @abstractmethod + def write_new_upload(self, table, data): + pass + @abstractmethod def write_mzid_info(self, analysis_software_list, spectra_formats, - provider, audits, samples, bib): - """ - Update Upload row with mzid info. - - ToDo: have this explicitly or create update func? - :param spectra_formats: - :param provider: - :param audits: - :param samples: - :param bib: - :return: - """ - upload = Table("upload", self.meta, autoload_with=self.engine, quote=False) - stmt = upload.update().where(upload.c.id == str(self.upload_id)).values( - analysis_software_list=analysis_software_list, - spectra_formats=spectra_formats, - provider=provider, - audit_collection=audits, - analysis_sample_collection=samples, - bib=bib - ) - with self.engine.connect() as conn: - conn.execute(stmt) - conn.commit() - - def write_other_info(self, contains_crosslinks, upload_warnings): - """ - Update Upload row with remaining info. - - ToDo: have this explicitly or create update func? 
- :param contains_crosslinks: - :param upload_warnings: - :return: - """ - upload = Table("upload", self.meta, autoload_with=self.engine, quote=False) - with self.engine.connect() as conn: - stmt = upload.update().where(upload.c.id == str(self.upload_id)).values( - contains_crosslinks=contains_crosslinks, - upload_warnings=upload_warnings, - ) - conn.execute(stmt) - conn.commit() + provider, audits, samples, bib, upload_id): + pass + @abstractmethod def fill_in_missing_scores(self): - """ - ToDo: this needs to be adapted to sqlalchemy from old SQLite version - """ - pass - # try: - # cur.execute(""" - # SELECT DISTINCT scoresJSON.key as scoreKey - # FROM spectrum_identifications, json_each(spectrum_identifications.scores) - # AS scoresJSON""") - # - # all_scores = cur.fetchall() - # all_scores = set([str(x[0]) for x in all_scores]) - # - # inj_list = [] - # - # cur.execute('SELECT id, scores FROM spectrum_identifications') - # res = cur.fetchall() - # - # for row in res: - # row_scores = json.loads(row[1]) - # missing = all_scores - set(row_scores.keys()) - # - # if len(missing) > 0: - # missing_dict = {key: -1 for key in missing} - # row_scores.update(missing_dict) - # inj_list.append([json.dumps(row_scores), row[0]]) - # # cur.execute('UPDATE identifications SET allScores=? WHERE id = row[0]', - # json.dumps(row_scores)) - # - # cur.executemany(""" - # UPDATE spectrum_identifications - # SET `scores` = ? - # WHERE `id` = ?""", inj_list) - # - # con.commit() - # - # except sqlite3.Error as e: - # raise DBException(e.message) - # pass + pass \ No newline at end of file diff --git a/pyproject.toml.bak b/pyproject.toml.bak new file mode 100644 index 00000000..e6cc1ea9 --- /dev/null +++ b/pyproject.toml.bak @@ -0,0 +1,36 @@ +[project] +name = "xi-mzidentml-converter" +version = "0.1.2" +description = "xi-mzidentml-converter uses pyteomics (https://pyteomics.readthedocs.io/en/latest/index.html) to parse mzIdentML files (v1.2.0) and extract crosslink information. Results are written to a relational database (PostgreSQL or SQLite) using sqlalchemy." +readme = "README.md" +dependencies = [ +'python >= 3.10', +'fastapi>=0.68.0,<0.69.0', +'uvicorn<=0.16.0,>=0.15.0', +'lxml>=4.9.1', +'numpy>=1.14.3', +'pandas>=0.21.0', +'pymzml>=0.7.8', +'pyteomics>=3.4.2', +'requests>=2.20.1', +'urllib3>=1.24.2', +'pytest', +'psycopg2-binary', +'sqlalchemy==2.0.21', +'sqlalchemy-utils', +'obonet', +'python-multipart', +'python-jose', +'passlib', +'jose' + ] + +[build-system] +requires = ["setuptools>=61.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project.scripts] +process = "process:main" + +[tool.setuptools.packages] +find = {} \ No newline at end of file diff --git a/scripts/runDataset.sh b/scripts/runDataset.sh new file mode 100644 index 00000000..1ade05d3 --- /dev/null +++ b/scripts/runDataset.sh @@ -0,0 +1,71 @@ +#!/usr/bin/env bash + +# Load environment (and make the slrum command available) +. 
/etc/profile.d/slurm.sh + +# This job parses one crosslinking dataset (by project accession) using the xi-mzidentml-converter + +##### OPTIONS +# (required) the project accession +PROJECT_ACCESSION="" + +##### VARIABLES +# the name to give to the SLURM job (to be extended with additional info) +JOB_NAME="xi-mzidentml-converter" +# memory limit +MEMORY_LIMIT=8G +#email notification +JOB_EMAIL="pride-report@ebi.ac.uk" + +##### FUNCTIONS +printUsage() { + echo "Description: This will parse a cross-linking dataset" + echo "$ ./scripts/runDataset.sh" + echo "" + echo "Usage: ./runDataset.sh -a|--accession [-e|--email]" + echo " Example: ./runDataset.sh -a PXD036833" + echo " (required) accession : the project accession of a crosslinking dataset" + echo " (optional) email : Email to send SLURM notification" +} + +##### PARSE the provided parameters +while [ "$1" != "" ]; do + case $1 in + "-a" | "--accession") + shift + PROJECT_ACCESSION=$1 + ;; + esac + shift +done + +##### CHECK the provided arguments +if [ -z ${PROJECT_ACCESSION} ]; then + echo "Need to enter a project accession" + printUsage + exit 1 +fi + +DATE=$(date +"%Y%m%d%H%M") +LOG_FILE_NAME="${JOB_NAME}/${PROJECT_ACCESSION}-${DATE}.log" +LOG_FILE_ERROR_NAME="${JOB_NAME}/${PROJECT_ACCESSION}-${DATE}-error.log" +JOB_NAME="${JOB_NAME}-${PROJECT_ACCESSION}" + + +##### Change directory to where the script is located +cd ${0%/*} + +##### Activate conda environment +source /hps/software/users/juan/pride/anaconda3/etc/profile.d/conda.sh +conda activate xi-mzidentml-converter + +#### RUN it on the cluster ##### +sbatch -t 7-0 \ + --mem=${MEMORY_LIMIT} \ + --partition datamover \ + --mail-type=ALL \ + --mail-user=${JOB_EMAIL} \ + --job-name=${JOB_NAME} \ + --output ${LOG_FILE_NAME} \ + --error ${LOG_FILE_ERROR_NAME} \ + --wrap="process_dataset -p ${PROJECT_ACCESSION} --dontdelete -w api" diff --git a/setup.py b/setup.py new file mode 100644 index 00000000..cb43a725 --- /dev/null +++ b/setup.py @@ -0,0 +1,65 @@ +import codecs +import os.path + +from setuptools import find_packages +from setuptools import setup + +with open("README.md", encoding="UTF-8") as fh: + long_description = fh.read() + + +def read(rel_path): + here = os.path.abspath(os.path.dirname(__file__)) + with codecs.open(os.path.join(here, rel_path), "r") as fp: + return fp.read() + + +def get_version(rel_path): + for line in read(rel_path).splitlines(): + if line.startswith("__version__"): + delim = '"' if '"' in line else "'" + return line.split(delim)[1] + raise RuntimeError("Unable to find version string.") + + +setup( + name="xi-mzidentml-converter", + version="0.1.19", + description="xi-mzidentml-converter uses pyteomics (https://pyteomics.readthedocs.io/en/latest/index.html) to parse mzIdentML files (v1.2.0) and extract crosslink information. 
Results are written to a relational database (PostgreSQL or SQLite) using sqlalchemy.", + long_description_content_type="text/markdown", + long_description=long_description, + license="'Apache 2.0", + url="https://github.com/PRIDE-Archive/xi-mzidentml-converter", + packages=find_packages(), + package_data={'config': ['logging.ini','database.ini']}, + install_requires=[ + 'fastapi>=0.68.0,<0.69.0', + 'uvicorn<=0.16.0,>=0.15.0', + 'lxml>=4.9.1', + 'numpy>=1.14.3', + 'pandas>=0.21.0', + 'pymzml>=0.7.8', + 'pyteomics>=3.4.2', + 'requests>=2.20.1', + 'urllib3>=1.24.2', + 'pytest', + 'psycopg2-binary', + 'sqlalchemy==2.0.21', + 'sqlalchemy-utils', + 'obonet', + 'python-multipart', + 'python-jose', + 'passlib', + 'jose' + ], + entry_points={"console_scripts": ["process_dataset = parser.process_dataset:main"]}, + platforms=["any"], + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + "Topic :: Scientific/Engineering :: Bio-Informatics", + ], + keywords="crosslinking python proteomics", + python_requires=">=3.10", +) \ No newline at end of file diff --git a/tests/db_pytest_fixtures.py b/tests/db_pytest_fixtures.py index 0dac692b..c0e5c9f8 100644 --- a/tests/db_pytest_fixtures.py +++ b/tests/db_pytest_fixtures.py @@ -1,4 +1,4 @@ -from create_db_schema import create_schema, create_db, drop_db +from parser.database.create_db_schema import create_schema, create_db, drop_db import pytest from sqlalchemy import create_engine diff --git a/tests/parse_csv.py b/tests/parse_csv.py index b7120db5..ed2d4551 100644 --- a/tests/parse_csv.py +++ b/tests/parse_csv.py @@ -1,5 +1,5 @@ from parser import FullCsvParser, NoPeakListsCsvParser, LinksOnlyCsvParser -from parser.writer import Writer +from parser.api_writer import Writer from sqlalchemy import text from uuid import uuid4 from .db_pytest_fixtures import * diff --git a/tests/parse_mzid.py b/tests/parse_mzid.py index 7d3e0f12..f6c6ac6a 100644 --- a/tests/parse_mzid.py +++ b/tests/parse_mzid.py @@ -1,5 +1,5 @@ from parser import MzIdParser -from parser.writer import Writer +from parser.api_writer import Writer from sqlalchemy import text from uuid import uuid4 from .db_pytest_fixtures import * diff --git a/tests/test_CsvParsers.py b/tests/test_CsvParsers.py index da7e6b1a..cf538cd6 100644 --- a/tests/test_CsvParsers.py +++ b/tests/test_CsvParsers.py @@ -1,5 +1,5 @@ import os -from parser.writer import Table +from parser.api_writer import Table import logging from parser.peaklistReader.PeakListWrapper import PeakListWrapper from .db_pytest_fixtures import * diff --git a/tests/test_MzIdParser_ecoli_dsso.py b/tests/test_MzIdParser_ecoli_dsso.py index e015da0f..2b8d66b8 100644 --- a/tests/test_MzIdParser_ecoli_dsso.py +++ b/tests/test_MzIdParser_ecoli_dsso.py @@ -4,7 +4,7 @@ """ import numpy as np from numpy.testing import assert_array_equal -from parser.writer import Table +from parser.api_writer import Table import os import logging from sqlalchemy import text diff --git a/tests/test_MzIdParser_matrixscience.py b/tests/test_MzIdParser_matrixscience.py index a5d98188..782b34b2 100644 --- a/tests/test_MzIdParser_matrixscience.py +++ b/tests/test_MzIdParser_matrixscience.py @@ -1,4 +1,4 @@ -from parser.writer import Writer, Table +from parser.api_writer import Writer, Table import os import logging from .db_pytest_fixtures import * diff --git a/tests/test_MzIdParser_matrixscience2.py b/tests/test_MzIdParser_matrixscience2.py index b4692093..54255487 100644 --- 
a/tests/test_MzIdParser_matrixscience2.py +++ b/tests/test_MzIdParser_matrixscience2.py @@ -1,4 +1,4 @@ -from parser.writer import Writer, Table +from parser.api_writer import Writer, Table import os import logging from .db_pytest_fixtures import * diff --git a/tests/test_fasta_reader.py b/tests/test_fasta_reader.py index 5d9408f7..c490813d 100644 --- a/tests/test_fasta_reader.py +++ b/tests/test_fasta_reader.py @@ -1,7 +1,7 @@ import os from parser import SimpleFASTA -from parser.writer import Table +from parser.api_writer import Table import logging from parser.peaklistReader.PeakListWrapper import PeakListWrapper from .db_pytest_fixtures import *
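
A sketch of the new writer strategy in use: parser/writer.py is now an abstract Writer, with DatabaseWriter writing rows through SQLAlchemy and APIWriter posting the same payloads as JSON to the REST endpoints. The snippet below mirrors how convert_dir in parser/process_dataset.py selects a strategy and hands it to MzIdParser; parse_one_file is a hypothetical helper, and the peak-list directory is assumed to be the same folder as the mzid file.

import logging
import os

from config.config_parser import get_conn_str
from parser.MzIdParser import MzIdParser
from parser.api_writer import APIWriter
from parser.database_writer import DatabaseWriter

logger = logging.getLogger(__name__)


def parse_one_file(mzid_path, project_identifier, writer_method='db'):
    """Hypothetical helper mirroring convert_dir: pick a writer strategy, then run the parser."""
    conn_str = get_conn_str()
    if writer_method.lower() == 'api':
        # Sends rows as JSON payloads to the REST endpoints (write_data, write_new_upload, ...)
        writer = APIWriter(conn_str, pxid=project_identifier)
    else:
        # Writes rows directly with SQLAlchemy, creating the schema if the database is missing
        writer = DatabaseWriter(conn_str, pxid=project_identifier)
    local_dir = os.path.dirname(mzid_path)
    # Peak-list directory assumed to sit alongside the mzid file for this sketch
    id_parser = MzIdParser(mzid_path, local_dir, local_dir, writer, logger)
    id_parser.parse()

Once the package is installed, the same flow is reachable through the process_dataset console script registered in setup.py, e.g. process_dataset -p PXD036833 -w db.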
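
On the base64 change in MzIdParser.main_loop: the mz and intensity arrays are packed as doubles with struct.pack and, on the APIWriter path, base64-encoded so the binary data survives the JSON request. The receiving service is not part of this diff; the snippet below is only a sketch of how such an endpoint could recover the values (decode_spectrum_blob is a hypothetical name).

import base64
import struct


def decode_spectrum_blob(encoded: str) -> list[float]:
    """Reverse the writer side: base64 text -> raw bytes -> 8-byte doubles."""
    raw = base64.b64decode(encoded)
    count = len(raw) // 8  # struct.pack(f'{n}d', ...) writes 8 bytes per value
    return list(struct.unpack(f'{count}d', raw))


# Round trip matching what the parser now does before an API call
mz_values = [101.07, 203.14, 305.21]
blob = struct.pack(f'{len(mz_values)}d', *mz_values)
encoded = base64.b64encode(blob).decode('utf-8')
assert decode_spectrum_blob(encoded) == list(struct.unpack(f'{len(mz_values)}d', blob))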
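
config/config_parser.py itself is not shown in this diff, although it replaces the deleted db_config_parser.py and now also has to supply get_api_configs for APIWriter. Based on the keys APIWriter reads (base_url, api_key, api_key_value) and the new base_url entry under [security] in default.database.ini, a plausible shape is sketched below; this is an assumption for illustration, not the file's actual contents.

import os
from configparser import ConfigParser


def get_api_configs():
    # Same INI file the old db_config_parser read: DB_CONFIG env var, falling back to database.ini
    parser = ConfigParser()
    parser.read(os.environ.get('DB_CONFIG', 'database.ini'))
    security = dict(parser.items('security')) if parser.has_section('security') else {}
    return {
        'base_url': security.get('base_url'),
        'api_key': 'apikey',                      # assumed header name expected by the API
        'api_key_value': security.get('apikey'),  # header value taken from [security]
    }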