Skip to content

Commit

Permalink
✨ Reintroduce taxa (#168)
Browse files (browse the repository at this point in the history)
Signed-off-by: zethson <[email protected]>
  • Loading branch information
Zethson authored Nov 19, 2024
1 parent cb9c2da commit 71a77ce
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 14 deletions.
32 changes: 22 additions & 10 deletions bionty/base/entities/_gene.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from __future__ import annotations

from collections.abc import Iterable
from typing import TYPE_CHECKING, Literal, NamedTuple

import pandas as pd
Expand Down Expand Up @@ -97,23 +96,34 @@ def map_legacy_ids(self, values: Iterable) -> MappingResult:


class EnsemblGene:
def __init__(self, organism: str, version: str) -> None:
def __init__(
self,
organism: str,
version: str,
taxa: Literal[
"vertebrates", "bacteria", "fungi", "metazoa", "plants", "all"
] = "vertebrates",
) -> None:
"""Ensembl Gene mysql.
Args:
organism: a bionty.Organism object
version: name of the ensembl DB version, e.g. "release-110"
organism: Name of the organism
version: Name of the ensembl DB version, e.g. "release-110"
taxa: The taxa of the organism to fetch genes for.
"""
self._import()
import mysql.connector as sql
from sqlalchemy import create_engine

self._organism = (
Organism(version=version).lookup().dict().get(organism) # type:ignore
)
self._url = (
f"mysql+mysqldb://anonymous:@ensembldb.ensembl.org/{self._organism.core_db}"
Organism(version=version, taxa=taxa).lookup().dict().get(organism) # type:ignore
)
# plants are served on port 4157; every other taxa uses port 3306
if taxa == "plants":
port = 4157
else:
port = 3306
self._url = f"mysql+mysqldb://anonymous:@ensembldb.ensembl.org:{port}/{self._organism.core_db}"
self._engine = create_engine(url=self._url)

def _import(self):
Expand Down Expand Up @@ -232,8 +242,10 @@ def add_external_db_column(df: pd.DataFrame, ext_db: str, df_col: str):
df_res = df_res[~df_res["ensembl_gene_id"].isna()]

# if stable_id is not ensembl_gene_id, keep a stable_id column
if not any(df_res["ensembl_gene_id"].str.startswith("ENS")):
logger.warning("no ensembl_gene_id found, writing to table_id column.")
if not all(df_res["ensembl_gene_id"].str.startswith("ENS")):
logger.warning(
"ensembl_gene_id column not all ENS-prefixed, writing to stable_id column."
)
df_res.insert(0, "stable_id", df_res.pop("ensembl_gene_id"))
df_res = df_res.sort_values("stable_id").reset_index(drop=True)
else:
Expand Down
10 changes: 6 additions & 4 deletions bionty/base/entities/_organism.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,7 @@ class Organism(PublicOntology):

def __init__(
self,
organism: Literal[
"vertebrates", "bacteria", "fungi", "metazoa", "plants", "all"
]
taxa: Literal["vertebrates", "bacteria", "fungi", "metazoa", "plants", "all"]
| None = None,
source: Literal["ensembl", "ncbitaxon"] | None = None,
version: Literal[
Expand All @@ -39,7 +37,11 @@ def __init__(
| None = None,
**kwargs,
):
super().__init__(organism=organism, source=source, version=version, **kwargs)
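# Backward compatibility: other parts of the code may still pass the former
# `organism` keyword argument (e.g. through getattr-based access), so it is
# accepted here as an alias for `taxa` when `taxa` itself is not given.
# https://github.com/laminlabs/bionty/issues/163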
if "organism" in kwargs and taxa is None:
taxa = kwargs.pop("organism")
super().__init__(organism=taxa, source=source, version=version, **kwargs)

def _load_df(self) -> pd.DataFrame:
if self.source == "ensembl":
Expand Down

0 comments on commit 71a77ce

Please sign in to comment.