From 3ae5a950d267b62aacae6c9b07ccadb345dcf756 Mon Sep 17 00:00:00 2001
From: Sierra Taylor Moxon
Date: Tue, 9 Apr 2024 15:10:57 -0700
Subject: [PATCH 1/4] minor doc clarification

---
 README.md | 40 +++++++++++++++++++++++++++-------------
 1 file changed, 27 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index 9308b20..2bfe9eb 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,24 @@ annotations that can be converted to mouse via orthology, and the mouse annotati
 make run
 ```

+`make all` will download the necessary upstream files like the Human & Human Isoform annotations, the Rat annotations,
+the MGI GPI file, the Alliance orthology file, the Mouse Xref file and the Gene Ontology in JSON format. It will then
+run the orthology annotation conversion step for each species, finally combining the resulting GAFs with the Protein2GO
+# gopreprocess
+
+This repo holds parsing and GAF generation code for generating automatic annotations based on orthology, or based on
+upstream products from Protein2GO.
+
+The Makefile helps coordinate the process for a single species, but each step can be run independently.
+
+For example, running this preprocessing pipeline for MGI to create a single GAF 2.2 file that represents
+all the automated annotations that the GO Consortium will consume for mouse (this includes the human and rat
+annotations that can be converted to mouse via orthology, and the mouse annotations created by Protein2GO):
+
+```bash
+make run
+```
+
 `make all` will download the necessary upstream files like the Human & Human Isoform annotations, the Rat annotations,
 the MGI GPI file, the Alliance orthology file, the Mouse Xref file and the Gene Ontology in JSON format. It will then
 run the orthology annotation conversion step for each species, finally combining the resulting GAFs with the Protein2GO
@@ -57,26 +75,22 @@ This process is conducted by the silver-issue-325-gopreprocess GO pipeline repos
 the process is best summarized by the following stages:

 1) The Download GAFs stage - this is where the orthologs species files and the Alliance orthology file are downloaded
-   and is executed via the pipeline jenkins file which calls `make download_rat` and `make download_human`
+and is executed via the pipeline jenkins file which calls `make download_rat` and `make download_human`

-2) The Generate automated annotations stage - this is where the orthologs species files are converted to mouse
-   annotations
-   and is executed via the pipeline jenkins file which calls `make convert_rat` and `make convert_human`. It also calls
-   `make convert_p2g_annotations` to download the Protein2GO annotations. Finally, it calls `make merge_gafs` to merge
-   the
-   resulting GAFs together and `make validate_merged_gafs` to validate the resulting GAF. If more than 3% of the
-   resulting
-   merged GAF is invalid, the `validate_merged_gafs` target will return a non-zero exit code and the pipeline will fail.
+2) The Generate automated annotations stage - this is where the orthologs species files are converted to mouse annotations
+and is executed via the pipeline jenkins file which calls `make convert_rat` and `make convert_human`. It also calls
+`make convert_p2g_annotations` to download the Protein2GO annotations. Finally, it calls `make merge_gafs` to merge the
+resulting GAFs together and `make validate_merged_gafs` to validate the resulting GAF. If more than 3% of the resulting
+merged GAF is invalid, the `validate_merged_gafs` target will return a non-zero exit code and the pipeline will fail.

 ![](docs/images/MGI_remainders_preprocess_pipeline_workflow.drawio.png)

 # run diff code to compare with an existing file
-
 ```bash
 poetry run compare -file1 file_path -file2 file_path -o file_output_prefix
 ```
-
-By default, this will compare subject_id (gene or gene product id), object_id (go term id), and evidence_code between
-the two files
+By default, this will compare subject_id (gene or gene product id), object_id (go term id), and evidence_code between the two files
 reporting the matching associations, the associations unique to file1 and those unique to file2.
+
+

From fa53bfdb3da6205669907a2c623ee3c62193c4ac Mon Sep 17 00:00:00 2001
From: Sierra Taylor Moxon
Date: Tue, 9 Apr 2024 15:26:36 -0700
Subject: [PATCH 2/4] change filename to reflect more specific context

---
 src/utils/generate_gpad.py | 2 +-
 src/utils/merge_gafs.py    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/utils/generate_gpad.py b/src/utils/generate_gpad.py
index b432ce8..7c8aa2a 100644
--- a/src/utils/generate_gpad.py
+++ b/src/utils/generate_gpad.py
@@ -81,7 +81,7 @@ def generate_gpad_file() -> tuple[Path, Path]:

     merged_gaf_filepath = pystow.join(
         key="MGI",
-        name="mgi-merged.gaf",
+        name="mgi-p2go-homology.gaf",
         ensure_exists=True,
     )
     gpad_rows = []
diff --git a/src/utils/merge_gafs.py b/src/utils/merge_gafs.py
index 7e9778e..8ae01e7 100644
--- a/src/utils/merge_gafs.py
+++ b/src/utils/merge_gafs.py
@@ -20,7 +20,7 @@ def merge_files_from_directory(source_directory: str):

     target_file_output = join(
         key=taxon_to_provider[target_taxon],
-        name=taxon_to_provider[target_taxon].lower() + "-merged.gaf",
+        name=taxon_to_provider[target_taxon].lower() + "-p2go-homology.gaf",
         ensure_exists=True,
     )

From 7a15a8ec93902f01e9f731968d2488a1ca9fb3a9 Mon Sep 17 00:00:00 2001
From: Sierra Taylor Moxon
Date: Tue, 9 Apr 2024 16:59:01 -0700
Subject: [PATCH 3/4] produce gzipped version

---
 src/utils/merge_gafs.py | 39 +++++++++++++++------------------------
 1 file changed, 15 insertions(+), 24 deletions(-)

diff --git a/src/utils/merge_gafs.py b/src/utils/merge_gafs.py
index 8ae01e7..a50b2ee 100644
--- a/src/utils/merge_gafs.py
+++ b/src/utils/merge_gafs.py
@@ -1,7 +1,8 @@
 """Utils for merging files together."""
 from pystow import join
-
+import gzip
+from pathlib import Path
 from src.utils.settings import taxon_to_provider


@@ -17,44 +18,36 @@ def merge_files_from_directory(source_directory: str):
     )

     target_taxon = "NCBITaxon:10090"
+    target_file_name = taxon_to_provider[target_taxon].lower() + "-p2go-homology.gaf.gz"

+    # Construct the target file output path with gzip extension
     target_file_output = join(
         key=taxon_to_provider[target_taxon],
-        name=taxon_to_provider[target_taxon].lower() + "-p2go-homology.gaf",
+        name=target_file_name,
         ensure_exists=True,
-    )
-
-    # Ensure the directory exists
-    if not source_directory.exists() or not source_directory.is_dir():
-        raise ValueError(f"{source_directory} is not a valid directory.")
-
-    # Get all .gaf files in the directory
-    gaf_files = list(source_directory.glob("*.gaf"))
+    ).as_posix()  # Use as_posix() to get the path as a string, assuming pystow >= 0.3.0

-    # List to store merged headers
+    # Initialize lists to store headers and data lines
     headers = []
-
-    # List to store data lines
     data_lines = []

+    # Get all .gaf files in the directory
+    gaf_files = list(Path(source_directory).glob("*.gaf"))
+
     # Process each file
     for file in gaf_files:
         with open(file, "r") as f:
             for line in f:
-                # If the line starts with '!', it's a header line
                 if line.startswith("!"):
                     headers.append(line.strip())
                 else:
                     data_lines.append(line)

-    # Deduplicate headers
-    unique_headers = list(set(headers))
+    # Deduplicate and sort headers
+    unique_headers = sorted(set(headers))

-    # Sort headers just to ensure consistent ordering
-    unique_headers.sort()
-
-    # Write the merged file
-    with open(target_file_output, "w") as out:
+    # Write the merged file with gzip compression
+    with gzip.open(target_file_output, "wt") as out:  # "wt" mode for writing text in a gzip file
         # Write the merged headers
         for header in unique_headers:
             out.write(header + "\n")
@@ -63,6 +56,4 @@ def merge_files_from_directory(source_directory: str):
         for line in data_lines:
             out.write(line)

-    out.close()
-
-    return target_file_output
+    return target_file_output
\ No newline at end of file

From 82b72ccc253196530554f8ba9654713880b6836b Mon Sep 17 00:00:00 2001
From: Sierra Taylor Moxon
Date: Tue, 9 Apr 2024 17:02:47 -0700
Subject: [PATCH 4/4] enable dynamic versioning again

---
 .github/workflows/pr-test-qc.yml | 4 ++--
 pyproject.toml                   | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/pr-test-qc.yml b/.github/workflows/pr-test-qc.yml
index eec923b..8e71f37 100644
--- a/.github/workflows/pr-test-qc.yml
+++ b/.github/workflows/pr-test-qc.yml
@@ -3,7 +3,7 @@ name: Run tests and QC
 # Controls when the action will run.
 on:
   pull_request:
-    types: [opened, synchronize, reopened]
+    types: [ opened, synchronize, reopened ]

   # Allows you to run this workflow manually from the Actions tab
   workflow_dispatch:
@@ -17,7 +17,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python: ["3.9", "3.10"]
+        python: [ "3.9", "3.10", "3.11", "3.12" ]

     # Steps represent a sequence of tasks that will be executed as part of the job
     steps:
diff --git a/pyproject.toml b/pyproject.toml
index ae2d45e..b80dc74 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gopreprocess"
-version = "0.0.0.post330.dev0+2158c39"
+version = "0.0.0"
 description = "gopreprocess"
 authors = ["Sierra Moxon "]
 license = "MIT"
@@ -36,7 +36,7 @@ validate = "src.gopreprocess.cli:validate"

 tests = ["pytest"]

 [tool.poetry-dynamic-versioning]
-enable = false
+enable = true
 vcs = "git"
 style = "pep440"
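A note on the pipeline stages described in the README changes of PATCH 1/4: the two stages boil down to a small set of Make targets. The sketch below runs those targets by hand; the target names come straight from the README text, but invoking them one at a time and in this exact order is an assumption for illustration, not something the patch itself prescribes.

```bash
# Stage 1: download the ortholog species GAFs and the Alliance orthology file.
make download_rat
make download_human

# Stage 2: convert the ortholog annotations to mouse, fetch the Protein2GO
# annotations, then merge the resulting GAFs and validate the merged file.
make convert_rat
make convert_human
make convert_p2g_annotations
make merge_gafs
make validate_merged_gafs  # exits non-zero if more than 3% of the merged GAF is invalid
```

In the GO pipeline these same targets are driven from the pipeline Jenkins file (the silver-issue-325-gopreprocess setup mentioned in the README) rather than run by hand.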
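PATCH 3/4 switches `merge_files_from_directory` to write the merged GAF through `gzip.open(target_file_output, "wt")`, so the merged output becomes a compressed text file (`mgi-p2go-homology.gaf.gz`). A minimal sketch for inspecting that output from the shell is below; the path is hypothetical, assuming pystow's default data directory of `~/.data` and the `MGI` key used in the code, so substitute whatever location pystow resolves to on your machine.

```bash
# Hypothetical path: pystow's default home is assumed to be ~/.data, under the "MGI" key.
MERGED_GAF=~/.data/MGI/mgi-p2go-homology.gaf.gz

# Show the deduplicated, sorted header block (GAF header lines start with "!").
gunzip -c "$MERGED_GAF" | grep '^!' | head

# Count annotation rows, i.e. everything that is not a header line.
gunzip -c "$MERGED_GAF" | grep -vc '^!'
```

The "wt" mode is the key detail: it writes text through the gzip stream, mirroring the plain-text `open(..., "w")` call it replaces, so downstream consumers must now read the file as gzip-compressed text rather than plain GAF.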