Merge pull request #68 from geneontology/issue_369_rename_output_GAF

Issue 369 rename output gaf and compress resulting merged file
geneontology · Apr 10, 2024 · 5d17198 · 5d17198
2 parents 8fd2e18 + 82b72cc
commit 5d17198
Show file tree

Hide file tree

Showing 5 changed files with 47 additions and 42 deletions.
diff --git a/.github/workflows/pr-test-qc.yml b/.github/workflows/pr-test-qc.yml
@@ -3,7 +3,7 @@ name: Run tests and QC
 # Controls when the action will run.
 on:
   pull_request:
-    types: [opened, synchronize, reopened]
+    types: [ opened, synchronize, reopened ]
 
   # Allows you to run this workflow manually from the Actions tab
   workflow_dispatch:
@@ -17,7 +17,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python: ["3.9", "3.10"]
+        python: [ "3.9", "3.10", "3.11", "3.12" ]
 
     # Steps represent a sequence of tasks that will be executed as part of the job
     steps:

diff --git a/README.md b/README.md
@@ -13,6 +13,24 @@ annotations that can be converted to mouse via orthology, and the mouse annotati
 make run
 ```
 
+`make all` will download the necessary upstream files like the Human & Human Isoform annotations, the Rat annotations,
+the MGI GPI file, the Alliance orthology file, the Mouse Xref file and the Gene Ontology in JSON format. It will then
+run the orthology annotation conversion step for each species, finally combining the resulting GAFs with the Protein2GO
+# gopreprocess
+
+This repo holds parsing and GAF generation code for generating automatic annotations based on orthology, or based on
+upstream products from Protein2GO.
+
+The Makefile helps coordinate the process for a single species, but each step can be run independently.
+
+For example, running this preprocessing pipeline for MGI to create a single GAF 2.2 file that represents
+all the automated annotations that the GO Consortium will consume for mouse (this includes the human and rat
+annotations that can be converted to mouse via orthology, and the mouse annotations created by Protein2GO):
+
+```bash
+make run
+```
+
 `make all` will download the necessary upstream files like the Human & Human Isoform annotations, the Rat annotations,
 the MGI GPI file, the Alliance orthology file, the Mouse Xref file and the Gene Ontology in JSON format. It will then
 run the orthology annotation conversion step for each species, finally combining the resulting GAFs with the Protein2GO
@@ -57,26 +75,22 @@ This process is conducted by the silver-issue-325-gopreprocess GO pipeline repos
 the process is best summarized by the following stages:
 
 1) The Download GAFs stage - this is where the ortholog species files and the Alliance orthology file are downloaded
-   and is executed via the pipeline jenkins file which calls `make download_rat` and `make download_human`
+and is executed via the pipeline jenkins file which calls `make download_rat` and `make download_human`
 
-2) The Generate automated annotations stage - this is where the orthologs species files are converted to mouse
-   annotations
-   and is executed via the pipeline jenkins file which calls `make convert_rat` and `make convert_human`. It also calls
-   `make convert_p2g_annotations` to download the Protein2GO annotations. Finally, it calls `make merge_gafs` to merge
-   the
-   resulting GAFs together and `make validate_merged_gafs` to validate the resulting GAF. If more than 3% of the
-   resulting
-   merged GAF is invalid, the `validate_merged_gafs` target will return a non-zero exit code and the pipeline will fail.
+2) The Generate automated annotations stage - this is where the ortholog species files are converted to mouse annotations
+and is executed via the pipeline jenkins file which calls `make convert_rat` and `make convert_human`. It also calls
+`make convert_p2g_annotations` to download the Protein2GO annotations. Finally, it calls `make merge_gafs` to merge the
+resulting GAFs together and `make validate_merged_gafs` to validate the resulting GAF. If more than 3% of the resulting
+merged GAF is invalid, the `validate_merged_gafs` target will return a non-zero exit code and the pipeline will fail.
 
 ![](docs/images/MGI_remainders_preprocess_pipeline_workflow.drawio.png)
 
 # run diff code to compare with an existing file
-
 ```bash
 poetry run compare -file1 file_path -file2 file_path -o file_output_prefix
 ```
-
-By default, this will compare subject_id (gene or gene product id), object_id (go term id), and evidence_code between
-the two files
+By default, this will compare subject_id (gene or gene product id), object_id (go term id), and evidence_code between the two files,
 reporting the matching associations, the associations unique to file1 and those unique to file2.
 
+
+
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gopreprocess"
-version = "0.0.0.post330.dev0+2158c39"
+version = "0.0.0"
 description = "gopreprocess"
 authors = ["Sierra Moxon <[email protected]>"]
 license = "MIT"
@@ -36,7 +36,7 @@ validate = "src.gopreprocess.cli:validate"
 tests = ["pytest"]
 
 [tool.poetry-dynamic-versioning]
-enable = false
+enable = true
 vcs = "git"
 style = "pep440"
 

diff --git a/src/utils/generate_gpad.py b/src/utils/generate_gpad.py
@@ -81,7 +81,7 @@ def generate_gpad_file() -> tuple[Path, Path]:
 
     merged_gaf_filepath = pystow.join(
         key="MGI",
-        name="mgi-merged.gaf",
+        name="mgi-p2go-homology.gaf",
         ensure_exists=True,
     )
     gpad_rows = []

diff --git a/src/utils/merge_gafs.py b/src/utils/merge_gafs.py
@@ -1,7 +1,8 @@
 """Utils for merging files together."""
 
 from pystow import join
-
+import gzip
+from pathlib import Path
 from src.utils.settings import taxon_to_provider
 
 
@@ -17,44 +18,36 @@ def merge_files_from_directory(source_directory: str):
     )
 
     target_taxon = "NCBITaxon:10090"
+    target_file_name = taxon_to_provider[target_taxon].lower() + "-p2go-homology.gaf.gz"
 
+    # Construct the target file output path with gzip extension
     target_file_output = join(
         key=taxon_to_provider[target_taxon],
-        name=taxon_to_provider[target_taxon].lower() + "-merged.gaf",
+        name=target_file_name,
         ensure_exists=True,
-    )
-
-    # Ensure the directory exists
-    if not source_directory.exists() or not source_directory.is_dir():
-        raise ValueError(f"{source_directory} is not a valid directory.")
-
-    # Get all .gaf files in the directory
-    gaf_files = list(source_directory.glob("*.gaf"))
+    ).as_posix()  # Use as_posix() to get the path as a string, assuming pystow >= 0.3.0
 
-    # List to store merged headers
+    # Initialize lists to store headers and data lines
     headers = []
-
-    # List to store data lines
     data_lines = []
 
+    # Get all .gaf files in the directory
+    gaf_files = list(Path(source_directory).glob("*.gaf"))
+
     # Process each file
     for file in gaf_files:
         with open(file, "r") as f:
             for line in f:
-                # If the line starts with '!', it's a header line
                 if line.startswith("!"):
                     headers.append(line.strip())
                 else:
                     data_lines.append(line)
 
-    # Deduplicate headers
-    unique_headers = list(set(headers))
+    # Deduplicate and sort headers
+    unique_headers = sorted(set(headers))
 
-    # Sort headers just to ensure consistent ordering
-    unique_headers.sort()
-
-    # Write the merged file
-    with open(target_file_output, "w") as out:
+    # Write the merged file with gzip compression
+    with gzip.open(target_file_output, "wt") as out:  # "wt" mode for writing text in a gzip file
         # Write the merged headers
         for header in unique_headers:
             out.write(header + "\n")
@@ -63,6 +56,4 @@ def merge_files_from_directory(source_directory: str):
         for line in data_lines:
             out.write(line)
 
-    out.close()
-
-    return target_file_output
+    return target_file_output