From 93b7f1fa3d6fea35379a3429e7b2efe00cc1bca3 Mon Sep 17 00:00:00 2001 From: Yasemin Bridges Date: Fri, 8 Nov 2024 15:24:56 +0000 Subject: [PATCH 1/6] refactor to use genes_to_disease.txt in place of disease.pg --- src/phenotype2phenopacket/add/add_genes.py | 22 +++++++++++----------- src/phenotype2phenopacket/cli_add.py | 16 ++++++++-------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/phenotype2phenopacket/add/add_genes.py b/src/phenotype2phenopacket/add/add_genes.py index 27c7724..4611360 100644 --- a/src/phenotype2phenopacket/add/add_genes.py +++ b/src/phenotype2phenopacket/add/add_genes.py @@ -18,24 +18,24 @@ def get_phenotype_to_disease_entries( - omim_disease_pg: pl.DataFrame, disease: Disease + genes_to_disease: pl.DataFrame, disease: Disease ) -> pl.DataFrame: """ Return disease.pg entries that match the provided OMIM disease ID. Args: - omim_disease_pg (pl.DataFrame): DataFrame containing disease.pg entries. + genes_to_disease (pl.DataFrame): DataFrame containing genes_to_disease.txt entries. disease (Disease): Disease object containing the OMIM disease ID. Returns: pl.DataFrame: Filtered DataFrame containing entries matching the OMIM disease ID. """ - return omim_disease_pg.filter(pl.col("database_id") == disease.term.id) + return genes_to_disease.filter(pl.col("disease_id") == disease.term.id) def add_genes( phenopacket_path: Path, - disease_pg: pl.DataFrame, + genes_to_disease: pl.DataFrame, gene_identifier_updater: GeneIdentifierUpdater, output_dir: Path, ): @@ -44,21 +44,21 @@ def add_genes( Args: phenopacket_path (Path): Path to the phenopacket file. - disease_pg (pl.DataFrame): DataFrame containing disease.pg entries. + genes_to_disease (pl.DataFrame): DataFrame containing genes_to_disease.txt entries. gene_identifier_updater (GeneIdentifierUpdater): Object for updating gene identifiers. output_dir (Path): Directory to write the updated phenopacket. 
""" phenopacket = phenopacket_reader(phenopacket_path) disease = PhenopacketUtil(phenopacket).return_phenopacket_disease() - filtered_disease_pg = get_phenotype_to_disease_entries(disease_pg, disease) - if len(filtered_disease_pg) == 0: + filtered_genes_to_disease = get_phenotype_to_disease_entries(genes_to_disease, disease) + if len(filtered_genes_to_disease) == 0: print(f"No gene-to-phenotype matches: {disease.term.id}, {disease.term.label}") else: phenopacket_with_genes = PhenopacketInterpretationExtender( phenopacket ).add_gene_interpretation_to_phenopacket( - omim_disease_phenotype_gene_map=filtered_disease_pg, + omim_disease_phenotype_gene_map=filtered_genes_to_disease, gene_identifier_updater=gene_identifier_updater, ) ( @@ -68,13 +68,13 @@ def add_genes( ) -def add_genes_to_directory(phenopacket_dir: Path, disease_pg: pl.DataFrame, output_dir: Path): +def add_genes_to_directory(phenopacket_dir: Path, genes_to_disease: pl.DataFrame, output_dir: Path): """ Add known gene-to-phenotype relationships to the interpretations of a directory of phenopackets. Args: phenopacket_dir (Path): Directory containing the phenopacket files. - disease_pg (pl.DataFrame): DataFrame containing disease.pg entries. + genes_to_disease (pl.DataFrame): DataFrame containing genes_to_disease.txt entries. output_dir (Path): Directory to store the updated phenopackets. 
""" hgnc_dict = create_hgnc_dict() @@ -83,4 +83,4 @@ def add_genes_to_directory(phenopacket_dir: Path, disease_pg: pl.DataFrame, outp gene_identifier="ensembl_id", hgnc_data=hgnc_dict, identifier_map=identifier_map ) for phenopacket_path in all_files(phenopacket_dir): - add_genes(phenopacket_path, disease_pg, gene_identifier_updater, output_dir) + add_genes(phenopacket_path, genes_to_disease, gene_identifier_updater, output_dir) diff --git a/src/phenotype2phenopacket/cli_add.py b/src/phenotype2phenopacket/cli_add.py index 180ad4b..af642ca 100644 --- a/src/phenotype2phenopacket/cli_add.py +++ b/src/phenotype2phenopacket/cli_add.py @@ -3,7 +3,7 @@ import click from phenotype2phenopacket.add.add_genes import add_genes_to_directory -from phenotype2phenopacket.utils.utils import read_disease_pg +from phenotype2phenopacket.utils.utils import read_genes_to_disease @click.command("add-genes") @@ -15,10 +15,10 @@ type=Path, ) @click.option( - "--disease-pg", - "-d", + "--genes-to-disease", + "-g", required=True, - help="Path to disease.pg data file.", + help="Path to genes_to_disease.txt data file.", type=Path, ) @click.option( @@ -30,7 +30,7 @@ ) def add_genes_command( phenopacket_dir: Path, - disease_pg: Path, + genes_to_disease: Path, output_dir: Path, ): """ @@ -38,13 +38,13 @@ def add_genes_command( Args: phenopacket_dir (Path): Directory containing the phenopacket files. - disease_pg (Path): Path to the disease.pg file. + genes_to_disease (Path): Path to the genes_to_disease.txt file. output_dir (Path): Directory to store the updated phenopackets. 
""" output_dir.mkdir(exist_ok=True) - disease_pg_df = read_disease_pg(disease_pg) + genes_to_disease_df = read_genes_to_disease(genes_to_disease) add_genes_to_directory( phenopacket_dir, - disease_pg_df, + genes_to_disease_df, output_dir, ) From 369fa88736d1d999cf7b6f47bbef90767bff60fc Mon Sep 17 00:00:00 2001 From: Yasemin Bridges Date: Fri, 8 Nov 2024 15:25:14 +0000 Subject: [PATCH 2/6] refactor to use genes_to_disease.txt in place of disease.pg --- .../utils/phenopacket_utils.py | 44 ++++++------- src/phenotype2phenopacket/utils/utils.py | 63 +++++++++++-------- tests/test_phenopacket_utils.py | 33 +++------- 3 files changed, 66 insertions(+), 74 deletions(-) diff --git a/src/phenotype2phenopacket/utils/phenopacket_utils.py b/src/phenotype2phenopacket/utils/phenopacket_utils.py index 51b6d2b..540e258 100644 --- a/src/phenotype2phenopacket/utils/phenopacket_utils.py +++ b/src/phenotype2phenopacket/utils/phenopacket_utils.py @@ -861,7 +861,7 @@ def __init__(self, phenopacket: Phenopacket): @staticmethod def create_gene_genomic_interpretation( - gene_to_phenotype_entry: dict, gene_identifier_updater: GeneIdentifierUpdater + gene_to_disease_entry: dict, gene_identifier_updater: GeneIdentifierUpdater ) -> GenomicInterpretation: """ Create genomic interpretation for a gene-to-phenotype relationship. @@ -869,7 +869,7 @@ def create_gene_genomic_interpretation( This method generates a GenomicInterpretation object based on a gene-to-phenotype relationship entry. Args: - gene_to_phenotype_entry (dict): A dictionary representing a gene-to-phenotype relationship. + gene_to_disease_entry (dict): A dictionary representing a gene-to-disease relationship. gene_identifier_updater (GeneIdentifierUpdater): An instance of GeneIdentifierUpdater. Returns: @@ -877,29 +877,25 @@ def create_gene_genomic_interpretation( of the gene-to-phenotype relationship or None if unsuccessful. 
""" try: - gene_symbol = gene_identifier_updater.obtain_gene_symbol_from_identifier( - str(gene_to_phenotype_entry["entrez_id"]) - ) + gene_symbol = gene_to_disease_entry["gene_symbol"] return GenomicInterpretation( subject_or_biosample_id="patient1", - interpretation_status=( - 4 if gene_to_phenotype_entry["disease_name"].startswith("?") is False else 0 - ), + interpretation_status=4, gene=GeneDescriptor( value_id=gene_identifier_updater.find_identifier(gene_symbol), symbol=gene_symbol, ), ) except KeyError: - print(f"Unable to find gene_symbol for {gene_to_phenotype_entry['entrez_id']}") + print(f"Unable to find gene_symbol for {gene_to_disease_entry['entrez_id']}") return None except TypeError: - print("N/A value", gene_to_phenotype_entry) + print("N/A value", gene_to_disease_entry) return None def create_gene_genomic_interpretations( self, - omim_disease_phenotype_gene_map: pl.DataFrame, + genes_to_disease_map: pl.DataFrame, gene_identifier_updater: GeneIdentifierUpdater, ) -> List[GenomicInterpretation]: """ @@ -909,7 +905,7 @@ def create_gene_genomic_interpretations( containing known gene-to-phenotype relationships. Args: - omim_disease_phenotype_gene_map (pl.DataFrame): DataFrame containing gene-to-phenotype mappings. + genes_to_disease_map (pl.DataFrame): DataFrame containing gene-to-disease mappings. gene_identifier_updater (GeneIdentifierUpdater): An instance of GeneIdentifierUpdater. Returns: @@ -917,9 +913,9 @@ def create_gene_genomic_interpretations( of gene-to-phenotype relationships. 
""" genomic_interpretations = [] - for phenotype_entry in omim_disease_phenotype_gene_map.rows(named=True): + for disease_entry in genes_to_disease_map.rows(named=True): genomic_interpretation = self.create_gene_genomic_interpretation( - phenotype_entry, gene_identifier_updater + disease_entry, gene_identifier_updater ) if genomic_interpretation is not None: genomic_interpretations.append(genomic_interpretation) @@ -927,7 +923,7 @@ def create_gene_genomic_interpretations( def create_gene_diagnosis( self, - omim_disease_phenotype_gene_map: pl.DataFrame, + genes_to_disease_map: pl.DataFrame, gene_identifier_updater: GeneIdentifierUpdater, disease: Disease, ) -> Diagnosis: @@ -938,7 +934,7 @@ def create_gene_diagnosis( provided in a DataFrame and a Disease object. Args: - omim_disease_phenotype_gene_map (pl.DataFrame): DataFrame containing gene-to-phenotype mappings. + genes_to_disease_map (pl.DataFrame): DataFrame containing gene-to-disease mappings. gene_identifier_updater (GeneIdentifierUpdater): An instance of GeneIdentifierUpdater. disease (Disease): An instance of Disease representing the disease information. @@ -947,7 +943,7 @@ def create_gene_diagnosis( or None if no genomic interpretations were found. """ genomic_interpretations = self.create_gene_genomic_interpretations( - omim_disease_phenotype_gene_map, gene_identifier_updater + genes_to_disease_map, gene_identifier_updater ) return ( Diagnosis( @@ -963,7 +959,7 @@ def create_gene_diagnosis( def create_gene_interpretation( self, - omim_disease_phenotype_gene_map: pl.DataFrame, + genes_to_disease_map: pl.DataFrame, gene_identifier_updater: GeneIdentifierUpdater, ) -> Interpretation: """ @@ -973,7 +969,7 @@ def create_gene_interpretation( provided in a DataFrame and a GeneIdentifierUpdater instance. Args: - omim_disease_phenotype_gene_map (pl.DataFrame): DataFrame containing gene-to-phenotype mappings. + genes_to_disease_map (pl.DataFrame): DataFrame containing gene-to-disease mappings. 
gene_identifier_updater (GeneIdentifierUpdater): An instance of GeneIdentifierUpdater. Returns: @@ -983,7 +979,7 @@ def create_gene_interpretation( phenopacket_util = PhenopacketUtil(self.phenopacket) disease = phenopacket_util.return_phenopacket_disease() diagnosis = self.create_gene_diagnosis( - omim_disease_phenotype_gene_map, gene_identifier_updater, disease + genes_to_disease_map, gene_identifier_updater, disease ) return ( Interpretation( @@ -997,7 +993,7 @@ def create_gene_interpretation( def add_gene_interpretation_to_phenopacket( self, - omim_disease_phenotype_gene_map: pl.DataFrame, + genes_to_disease_map: pl.DataFrame, gene_identifier_updater: GeneIdentifierUpdater, ) -> Phenopacket: """ @@ -1006,7 +1002,7 @@ def add_gene_interpretation_to_phenopacket( This method adds gene-based interpretations to a copy of the Phenopacket. Args: - omim_disease_phenotype_gene_map (pl.DataFrame): DataFrame containing gene-to-phenotype mappings. + genes_to_disease_map (pl.DataFrame): DataFrame containing gene-to-disease mappings. gene_identifier_updater (GeneIdentifierUpdater): An instance of GeneIdentifierUpdater. 
Returns: @@ -1015,9 +1011,7 @@ def add_gene_interpretation_to_phenopacket( """ phenopacket_copy = copy(self.phenopacket) interpretations = [ - self.create_gene_interpretation( - omim_disease_phenotype_gene_map, gene_identifier_updater - ) + self.create_gene_interpretation(genes_to_disease_map, gene_identifier_updater) ] if interpretations is not None: phenopacket_copy.interpretations.extend(interpretations) diff --git a/src/phenotype2phenopacket/utils/utils.py b/src/phenotype2phenopacket/utils/utils.py index e28f2fd..e2bfda9 100644 --- a/src/phenotype2phenopacket/utils/utils.py +++ b/src/phenotype2phenopacket/utils/utils.py @@ -28,35 +28,46 @@ def is_float(element: any) -> bool: return False -def read_disease_pg(disease_pg: Path) -> pl.DataFrame: +def read_genes_to_disease(genes_to_disease: Path) -> pl.DataFrame: """ - Read a disease.pg file and return a filtered Polars DataFrame. - - This function reads the contents of a 'disease.pg' file using Polars read_csv method - and constructs a DataFrame. It filters the DataFrame to include only rows where the 'database_id' - column starts with 'OMIM'. - + Read the genes_to_disease.txt file and return a Polars DataFrame. Args: - disease_pg (Path): The path to the 'disease.pg' file. - + genes_to_disease (Path): Path to the genes_to_disease.txt file. Returns: - pl.DataFrame: A filtered Polars DataFrame containing specific columns and rows - where 'database_id' starts with 'OMIM'. - """ - disease = pl.read_csv( - disease_pg, - separator="|", - new_columns=[ - "database_id", - "gene_mim_number", - "disease_name", - "entrez_id", - "diagnosis_status", - "inheritance", - ], - has_header=False, - ) - return disease.filter(pl.col("database_id").str.starts_with("OMIM")) + pl.DataFrame: A Polars DataFrame containing the contents of the genes_to_disease.txt. 
+ """ + return pl.read_csv(genes_to_disease, sep="\t") + + +# def read_disease_pg(disease_pg: Path) -> pl.DataFrame: +# """ +# Read a disease.pg file and return a filtered Polars DataFrame. +# +# This function reads the contents of a 'disease.pg' file using Polars read_csv method +# and constructs a DataFrame. It filters the DataFrame to include only rows where the 'database_id' +# column starts with 'OMIM'. +# +# Args: +# disease_pg (Path): The path to the 'disease.pg' file. +# +# Returns: +# pl.DataFrame: A filtered Polars DataFrame containing specific columns and rows +# where 'database_id' starts with 'OMIM'. +# """ +# disease = pl.read_csv( +# disease_pg, +# separator="|", +# new_columns=[ +# "database_id", +# "gene_mim_number", +# "disease_name", +# "entrez_id", +# "diagnosis_status", +# "inheritance", +# ], +# has_header=False, +# ) +# return disease.filter(pl.col("database_id").str.starts_with("OMIM")) def load_ontology(local_cached_ontology: Path = None): diff --git a/tests/test_phenopacket_utils.py b/tests/test_phenopacket_utils.py index 7aaaa03..9a2059c 100644 --- a/tests/test_phenopacket_utils.py +++ b/tests/test_phenopacket_utils.py @@ -202,21 +202,11 @@ gene_to_phenotypes = pl.from_dicts( [ + {"disease_id": "OMIM:612567", "ncbi_gene_id": "NCBIGene:3588", "gene_symbol": "IL10RB"}, { - "database_id": "OMIM:612567", - "gene_mim_number": "OMIM:123889", - "disease_name": "Inflammatory bowel disease 25, early onset, autosomal recessive", - "entrez_id": "3588", - "diagnosis_status": "D", - "inheritance": "R", - }, - { - "database_id": "OMIM:612567", - "gene_mim_number": "OMIM:123889", - "disease_name": "Inflammatory bowel disease 25, early onset, autosomal recessive", - "entrez_id": "1", - "diagnosis_status": "D", - "inheritance": "R", + "disease_id": "OMIM:612567", + "ncbi_gene_id": "NCBIGene:1", + "gene_symbol": "ADA", }, ] ) @@ -1179,12 +1169,9 @@ class TestPhenopacketInterpretationExtender(unittest.TestCase): def setUpClass(cls) -> None: cls.phenopacket 
= PhenopacketInterpretationExtender(phenopacket) cls.phenotype_to_gene_entry = { - "database_id": "OMIM:612567", - "gene_mim_number": "OMIM:123889", - "disease_name": "Inflammatory bowel disease 25, early onset, autosomal recessive", - "entrez_id": "3588", - "diagnosis_status": "D", - "inheritance": "R", + "disease_id": "OMIM:612567", + "ncbi_gene_id": "NCBIGene:3588", + "gene_symbol": "IL10RB", } cls.gene_identifier_updater = GeneIdentifierUpdater( gene_identifier="ensembl_id", @@ -1224,7 +1211,7 @@ def test_create_gene_genomic_interpretations(self): GenomicInterpretation( subject_or_biosample_id="patient1", interpretation_status=4, - gene=GeneDescriptor(value_id="ENSG00000121410", symbol="A1BG"), + gene=GeneDescriptor(value_id="ENSG00000196839", symbol="ADA"), ), ], ) @@ -1258,7 +1245,7 @@ def test_create_gene_diagnosis(self): GenomicInterpretation( subject_or_biosample_id="patient1", interpretation_status=4, - gene=GeneDescriptor(value_id="ENSG00000121410", symbol="A1BG"), + gene=GeneDescriptor(value_id="ENSG00000196839", symbol="ADA"), ), ], ), @@ -1290,7 +1277,7 @@ def test_create_gene_interpretation(self): GenomicInterpretation( subject_or_biosample_id="patient1", interpretation_status=4, - gene=GeneDescriptor(value_id="ENSG00000121410", symbol="A1BG"), + gene=GeneDescriptor(value_id="ENSG00000196839", symbol="ADA"), ), ], ), From 0fb602829ca36cce86f13be0b4da3bec43225cf7 Mon Sep 17 00:00:00 2001 From: Yasemin Bridges Date: Fri, 8 Nov 2024 15:25:50 +0000 Subject: [PATCH 3/6] remove method for reading disease.pg --- src/phenotype2phenopacket/utils/utils.py | 39 +++--------------------- 1 file changed, 4 insertions(+), 35 deletions(-) diff --git a/src/phenotype2phenopacket/utils/utils.py b/src/phenotype2phenopacket/utils/utils.py index e2bfda9..e87331b 100644 --- a/src/phenotype2phenopacket/utils/utils.py +++ b/src/phenotype2phenopacket/utils/utils.py @@ -39,37 +39,6 @@ def read_genes_to_disease(genes_to_disease: Path) -> pl.DataFrame: return 
pl.read_csv(genes_to_disease, sep="\t") -# def read_disease_pg(disease_pg: Path) -> pl.DataFrame: -# """ -# Read a disease.pg file and return a filtered Polars DataFrame. -# -# This function reads the contents of a 'disease.pg' file using Polars read_csv method -# and constructs a DataFrame. It filters the DataFrame to include only rows where the 'database_id' -# column starts with 'OMIM'. -# -# Args: -# disease_pg (Path): The path to the 'disease.pg' file. -# -# Returns: -# pl.DataFrame: A filtered Polars DataFrame containing specific columns and rows -# where 'database_id' starts with 'OMIM'. -# """ -# disease = pl.read_csv( -# disease_pg, -# separator="|", -# new_columns=[ -# "database_id", -# "gene_mim_number", -# "disease_name", -# "entrez_id", -# "diagnosis_status", -# "inheritance", -# ], -# has_header=False, -# ) -# return disease.filter(pl.col("database_id").str.starts_with("OMIM")) - - def load_ontology(local_cached_ontology: Path = None): """ Load the Human Phenotype Ontology (HPO). 
@@ -215,10 +184,10 @@ def read_omim_id_list(omim_id_list_file_path: Path) -> List[str]: def filter_diseases( - num_disease: int, - omim_id: str, - omim_id_list: Path, - phenotype_annotation_data: PhenotypeAnnotation, + num_disease: int, + omim_id: str, + omim_id_list: Path, + phenotype_annotation_data: PhenotypeAnnotation, ) -> List[pl.DataFrame]: """ Filter the phenotype annotation data to either only a specific disease, a specific number of diseases, From 798d185579fe2bf0677ab971f16af4aa3a9da5d5 Mon Sep 17 00:00:00 2001 From: Yasemin Bridges Date: Fri, 8 Nov 2024 15:30:22 +0000 Subject: [PATCH 4/6] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 10eea34..80f959f 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ p2p create --phenotype-annotation /path/to/phenotype.hpoa --output-dir /path/to/ To add known gene-to-phenotype relationships to phenopackets: ```shell -p2p add-genes --phenopacket-dir /path/to/synthetic-phenopackets --disease-pg /path/to/disease.pg --hgnc-data /path/to/hgnc_complete_set.txt --output-dir /path/to/output-dir +p2p add-genes --phenopacket-dir /path/to/synthetic-phenopackets --genes-to-disease /path/to/genes_to_disease.txt --hgnc-data /path/to/hgnc_complete_set.txt --output-dir /path/to/output-dir ``` -> **_NOTE:_** To add known gene-to-phenotype the Exomiser disease.pg file is expected \ No newline at end of file +> **_NOTE:_** To add known gene-to-phenotype relationships, the genes_to_disease.txt file is expected. It can be downloaded [here](https://hpo.jax.org/data/annotations). 
\ No newline at end of file From 9602306f46d086590376bf562d0374f3e0cfb042 Mon Sep 17 00:00:00 2001 From: Yasemin Bridges Date: Fri, 8 Nov 2024 15:30:56 +0000 Subject: [PATCH 5/6] tox lint --- src/phenotype2phenopacket/utils/utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/phenotype2phenopacket/utils/utils.py b/src/phenotype2phenopacket/utils/utils.py index e87331b..0b5a34a 100644 --- a/src/phenotype2phenopacket/utils/utils.py +++ b/src/phenotype2phenopacket/utils/utils.py @@ -184,10 +184,10 @@ def read_omim_id_list(omim_id_list_file_path: Path) -> List[str]: def filter_diseases( - num_disease: int, - omim_id: str, - omim_id_list: Path, - phenotype_annotation_data: PhenotypeAnnotation, + num_disease: int, + omim_id: str, + omim_id_list: Path, + phenotype_annotation_data: PhenotypeAnnotation, ) -> List[pl.DataFrame]: """ Filter the phenotype annotation data to either only a specific disease, a specific number of diseases, From 19d579df48b51b29b1ebb652a439a82fc197c2cc Mon Sep 17 00:00:00 2001 From: Yasemin Bridges Date: Fri, 8 Nov 2024 15:31:37 +0000 Subject: [PATCH 6/6] bump project version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 2c3079f..1517a35 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "phenotype2phenopacket" -version = "0.5.1" +version = "0.6.0" description = "" authors = ["Yasemin Bridges "] readme = "README.md"