From e75518cc5474084c15cd45a39aac0b89ee7db4cb Mon Sep 17 00:00:00 2001 From: Jacob Moss Date: Tue, 12 Nov 2024 14:32:07 +0000 Subject: [PATCH 01/10] add option to specify kingdom --- bionty/base/entities/_gene.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bionty/base/entities/_gene.py b/bionty/base/entities/_gene.py index 4543ee2..2f7a544 100644 --- a/bionty/base/entities/_gene.py +++ b/bionty/base/entities/_gene.py @@ -86,7 +86,7 @@ def map_legacy_ids( class EnsemblGene: - def __init__(self, organism: str, version: str) -> None: + def __init__(self, organism: str, version: str, kingdom: Literal['vertibrates', 'plants']=None) -> None: """Ensembl Gene mysql. Args: @@ -98,7 +98,7 @@ def __init__(self, organism: str, version: str) -> None: from sqlalchemy import create_engine self._organism = ( - Organism(version=version).lookup().dict().get(organism) # type:ignore + Organism(version=version, organism=kingdom).lookup().dict().get(organism) # type:ignore ) self._url = ( f"mysql+mysqldb://anonymous:@ensembldb.ensembl.org/{self._organism.core_db}" From a1d2d6a1d12050f9650ab428a4d539700c2f6879 Mon Sep 17 00:00:00 2001 From: Jacob Moss Date: Tue, 12 Nov 2024 17:12:24 +0000 Subject: [PATCH 02/10] fix port for plants --- bionty/base/entities/_gene.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bionty/base/entities/_gene.py b/bionty/base/entities/_gene.py index 2f7a544..3d82fd4 100644 --- a/bionty/base/entities/_gene.py +++ b/bionty/base/entities/_gene.py @@ -100,8 +100,11 @@ def __init__(self, organism: str, version: str, kingdom: Literal['vertibrates', self._organism = ( Organism(version=version, organism=kingdom).lookup().dict().get(organism) # type:ignore ) + port = 3306 + if kingdom == "plants": + port = 4157 self._url = ( - f"mysql+mysqldb://anonymous:@ensembldb.ensembl.org/{self._organism.core_db}" + f"mysql+mysqldb://anonymous:@ensembldb.ensembl.org:{port}/{self._organism.core_db}" ) self._engine = create_engine(url=self._url) From f4147172ffefd2a6c087fc9bd3d63dc7170d003d Mon Sep 17 00:00:00 2001 From: zethson Date: Wed, 13 Nov 2024 15:05:23 +0100 Subject: [PATCH 03/10] :art: Polish Signed-off-by: zethson --- bionty/base/entities/_gene.py | 18 +++++++++++------- bionty/base/entities/_organism.py | 6 ++---- tests/entities/test_organism.py | 2 +- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/bionty/base/entities/_gene.py b/bionty/base/entities/_gene.py index 3d82fd4..8bcb604 100644 --- a/bionty/base/entities/_gene.py +++ b/bionty/base/entities/_gene.py @@ -86,26 +86,30 @@ def map_legacy_ids( class EnsemblGene: - def __init__(self, organism: str, version: str, kingdom: Literal['vertibrates', 'plants']=None) -> None: + def __init__( + self, + organism: str, + version: str, + kingdom: Literal["vertibrates", "plants"] = None, + ) -> None: """Ensembl Gene mysql. Args: - organism: a bionty.Organism object - version: name of the ensembl DB version, e.g. "release-110" + organism: A `bionty.Organism` object + version: Name of the ensembl DB version, e.g. "release-110" + kingdom: Kingdom of the organism to fetch the genes for """ self._import() import mysql.connector as sql from sqlalchemy import create_engine self._organism = ( - Organism(version=version, organism=kingdom).lookup().dict().get(organism) # type:ignore + Organism(version=version, kingdom=kingdom).lookup().dict().get(organism) # type:ignore ) port = 3306 if kingdom == "plants": port = 4157 - self._url = ( - f"mysql+mysqldb://anonymous:@ensembldb.ensembl.org:{port}/{self._organism.core_db}" - ) + self._url = f"mysql+mysqldb://anonymous:@ensembldb.ensembl.org:{port}/{self._organism.core_db}" self._engine = create_engine(url=self._url) def _import(self): diff --git a/bionty/base/entities/_organism.py b/bionty/base/entities/_organism.py index 6f9c154..826be02 100644 --- a/bionty/base/entities/_organism.py +++ b/bionty/base/entities/_organism.py @@ -22,9 +22,7 @@ class Organism(PublicOntology): def __init__( self, - organism: Literal[ - "vertebrates", "bacteria", "fungi", "metazoa", "plants", "all" - ] + kingdom: Literal["vertebrates", "bacteria", "fungi", "metazoa", "plants", "all"] | None = None, source: Literal["ensembl", "ncbitaxon"] | None = None, version: Literal[ @@ -39,7 +37,7 @@ def __init__( | None = None, **kwargs, ): - super().__init__(organism=organism, source=source, version=version, **kwargs) + super().__init__(organism=kingdom, source=source, version=version, **kwargs) def _load_df(self) -> pd.DataFrame: if self.source == "ensembl": diff --git a/tests/entities/test_organism.py b/tests/entities/test_organism.py index f2a18af..8b5fdc3 100644 --- a/tests/entities/test_organism.py +++ b/tests/entities/test_organism.py @@ -24,7 +24,7 @@ def test_ensembl_organism_inspect_name(): def test_ensembl_organism_organism(): for sp in ["bacteria", "plants", "fungi", "metazoa"]: - df = bt_base.Organism(organism=sp).df() + df = bt_base.Organism(kingdom=sp).df() assert df.shape[0] > 10 From f6758ac1b17e6cd1f46ca5d474a2255e5b0e7f00 Mon Sep 17 00:00:00 2001 From: zethson Date: Thu, 14 Nov 2024 12:40:13 +0100 Subject: [PATCH 04/10] :art: Enable SQL queries again Signed-off-by: zethson --- bionty/base/entities/_gene.py | 71 +++++++++++++++++-------------- bionty/base/entities/_organism.py | 4 +- tests/entities/test_organism.py | 2 +- 3 files changed, 43 insertions(+), 34 deletions(-) diff --git a/bionty/base/entities/_gene.py b/bionty/base/entities/_gene.py index 8bcb604..180e9a0 100644 --- a/bionty/base/entities/_gene.py +++ b/bionty/base/entities/_gene.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Dict, Iterable, Literal, Optional +from typing import Iterable, Literal import pandas as pd from lamin_utils import logger @@ -90,25 +90,28 @@ def __init__( self, organism: str, version: str, - kingdom: Literal["vertibrates", "plants"] = None, + taxa: Literal[ + "vertebrates", "bacteria", "fungi", "metazoa", "plants", "all" + ] = "vertebrates", ) -> None: """Ensembl Gene mysql. Args: - organism: A `bionty.Organism` object + organism: A version: Name of the ensembl DB version, e.g. "release-110" - kingdom: Kingdom of the organism to fetch the genes for """ self._import() import mysql.connector as sql from sqlalchemy import create_engine self._organism = ( - Organism(version=version, kingdom=kingdom).lookup().dict().get(organism) # type:ignore + Organism(version=version, taxa=taxa).lookup().dict().get(organism) # type:ignore ) - port = 3306 - if kingdom == "plants": + # vertebrates and plants use different ports + if taxa == "plants": port = 4157 + else: + port = 3306 self._url = f"mysql+mysqldb://anonymous:@ensembldb.ensembl.org:{port}/{self._organism.core_db}" self._engine = create_engine(url=self._url) @@ -157,23 +160,15 @@ def download_df(self, external_db_names: dict | None = None) -> pd.DataFrame: WHERE object_xref.ensembl_object_type = 'Gene' AND external_db.db_name IN ({external_db_names_str}) # noqa """ - # Query for the basic gene annotations: - results_core = pd.read_sql(query_core, con=self._engine) - logger.info("fetching records from the core DB...") - - # aggregate metadata based on ensembl stable_id - results_core_group = results_core.groupby("stable_id").agg( - { - "display_label": "first", - "biotype": "first", - "description": "first", - "synonym": lambda x: "|".join([i for i in set(x) if i is not None]), - } - ) + from sqlalchemy.sql import text - # Query for external ids: - results_external = pd.read_sql(query_external, con=self._engine) - logger.info("fetching records from the external DBs...") + with self._engine.connect() as conn: + # Execute queries using SQLAlchemy text construct + results_core_group = pd.DataFrame(conn.execute(text(query_core))) + logger.info("fetching records from the core DB...") + + results_external = pd.DataFrame(conn.execute(text(query_external))) + logger.info("fetching records from the external DBs...") def add_external_db_column(df: pd.DataFrame, ext_db: str, df_col: str): # ncbi_gene_id @@ -240,15 +235,29 @@ def add_external_db_column(df: pd.DataFrame, ext_db: str, df_col: str): return df_res - def download_legacy_ids_df(self, df: pd.DataFrame, col: str | None = None): - col = "ensembl_gene_id" if col is None else col + def download_legacy_ids_df(self, df: pd.DataFrame, col: str = "ensembl_gene_id"): + """Download legacy IDs for given gene IDs. + + Args: + df: DataFrame containing gene IDs + col: Column name containing gene IDs, defaults to "ensembl_gene_id" + """ + from sqlalchemy.sql import text + current_ids = tuple(df[col]) - results = pd.read_sql( - "SELECT * FROM stable_id_event JOIN mapping_session USING" - " (mapping_session_id) WHERE type = 'gene' AND new_stable_id IN" - f" {current_ids} AND score > 0 AND old_stable_id != new_stable_id", - con=self._engine, - ) + + query = text(""" + SELECT * FROM stable_id_event + JOIN mapping_session USING (mapping_session_id) + WHERE type = 'gene' + AND new_stable_id IN :current_ids + AND score > 0 + AND old_stable_id != new_stable_id + """) + + with self._engine.connect() as conn: + results = pd.DataFrame(conn.execute(query, {"current_ids": current_ids})) + return results def _process_convert_result( diff --git a/bionty/base/entities/_organism.py b/bionty/base/entities/_organism.py index 826be02..088f73e 100644 --- a/bionty/base/entities/_organism.py +++ b/bionty/base/entities/_organism.py @@ -22,7 +22,7 @@ class Organism(PublicOntology): def __init__( self, - kingdom: Literal["vertebrates", "bacteria", "fungi", "metazoa", "plants", "all"] + taxa: Literal["vertebrates", "bacteria", "fungi", "metazoa", "plants", "all"] | None = None, source: Literal["ensembl", "ncbitaxon"] | None = None, version: Literal[ @@ -37,7 +37,7 @@ def __init__( | None = None, **kwargs, ): - super().__init__(organism=kingdom, source=source, version=version, **kwargs) + super().__init__(organism=taxa, source=source, version=version, **kwargs) def _load_df(self) -> pd.DataFrame: if self.source == "ensembl": diff --git a/tests/entities/test_organism.py b/tests/entities/test_organism.py index 8b5fdc3..f2a18af 100644 --- a/tests/entities/test_organism.py +++ b/tests/entities/test_organism.py @@ -24,7 +24,7 @@ def test_ensembl_organism_inspect_name(): def test_ensembl_organism_organism(): for sp in ["bacteria", "plants", "fungi", "metazoa"]: - df = bt_base.Organism(kingdom=sp).df() + df = bt_base.Organism(organism=sp).df() assert df.shape[0] > 10 From d003058e965c95a8707e7a260346e468576f9a65 Mon Sep 17 00:00:00 2001 From: zethson Date: Thu, 14 Nov 2024 13:23:01 +0100 Subject: [PATCH 05/10] :art: Polish Signed-off-by: zethson --- bionty/base/entities/_gene.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bionty/base/entities/_gene.py b/bionty/base/entities/_gene.py index 180e9a0..6ed8516 100644 --- a/bionty/base/entities/_gene.py +++ b/bionty/base/entities/_gene.py @@ -97,7 +97,7 @@ def __init__( """Ensembl Gene mysql. Args: - organism: A + organism: Name of the organism version: Name of the ensembl DB version, e.g. "release-110" """ self._import() From 96689feb06bee36fbb4e739f58cdd518aed9e4cb Mon Sep 17 00:00:00 2001 From: zethson Date: Thu, 14 Nov 2024 15:23:26 +0100 Subject: [PATCH 06/10] :art: Revert sql statement changes Signed-off-by: zethson --- bionty/base/entities/_gene.py | 54 ++++++++++++++----------------- bionty/base/entities/_organism.py | 3 ++ 2 files changed, 27 insertions(+), 30 deletions(-) diff --git a/bionty/base/entities/_gene.py b/bionty/base/entities/_gene.py index 6ed8516..1faecd3 100644 --- a/bionty/base/entities/_gene.py +++ b/bionty/base/entities/_gene.py @@ -160,15 +160,23 @@ def download_df(self, external_db_names: dict | None = None) -> pd.DataFrame: WHERE object_xref.ensembl_object_type = 'Gene' AND external_db.db_name IN ({external_db_names_str}) # noqa """ - from sqlalchemy.sql import text - - with self._engine.connect() as conn: - # Execute queries using SQLAlchemy text construct - results_core_group = pd.DataFrame(conn.execute(text(query_core))) - logger.info("fetching records from the core DB...") + # Query for the basic gene annotations: + results_core = pd.read_sql(query_core, con=self._engine) + logger.info("fetching records from the core DB...") + + # aggregate metadata based on ensembl stable_id + results_core_group = results_core.groupby("stable_id").agg( + { + "display_label": "first", + "biotype": "first", + "description": "first", + "synonym": lambda x: "|".join([i for i in set(x) if i is not None]), + } + ) - results_external = pd.DataFrame(conn.execute(text(query_external))) - logger.info("fetching records from the external DBs...") + # Query for external ids: + results_external = pd.read_sql(query_external, con=self._engine) + logger.info("fetching records from the external DBs...") def add_external_db_column(df: pd.DataFrame, ext_db: str, df_col: str): # ncbi_gene_id @@ -235,29 +243,15 @@ def add_external_db_column(df: pd.DataFrame, ext_db: str, df_col: str): return df_res - def download_legacy_ids_df(self, df: pd.DataFrame, col: str = "ensembl_gene_id"): - """Download legacy IDs for given gene IDs. - - Args: - df: DataFrame containing gene IDs - col: Column name containing gene IDs, defaults to "ensembl_gene_id" - """ - from sqlalchemy.sql import text - + def download_legacy_ids_df(self, df: pd.DataFrame, col: str | None = None): + col = "ensembl_gene_id" if col is None else col current_ids = tuple(df[col]) - - query = text(""" - SELECT * FROM stable_id_event - JOIN mapping_session USING (mapping_session_id) - WHERE type = 'gene' - AND new_stable_id IN :current_ids - AND score > 0 - AND old_stable_id != new_stable_id - """) - - with self._engine.connect() as conn: - results = pd.DataFrame(conn.execute(query, {"current_ids": current_ids})) - + results = pd.read_sql( + "SELECT * FROM stable_id_event JOIN mapping_session USING" + " (mapping_session_id) WHERE type = 'gene' AND new_stable_id IN" + f" {current_ids} AND score > 0 AND old_stable_id != new_stable_id", + con=self._engine, + ) return results def _process_convert_result( diff --git a/bionty/base/entities/_organism.py b/bionty/base/entities/_organism.py index 088f73e..ab61a8f 100644 --- a/bionty/base/entities/_organism.py +++ b/bionty/base/entities/_organism.py @@ -37,6 +37,9 @@ def __init__( | None = None, **kwargs, ): + # To support the organism kwarg being passed in getattr access in other parts of the code + if kwargs.get("organism") is not None: + taxa = kwargs.pop("organism") super().__init__(organism=taxa, source=source, version=version, **kwargs) def _load_df(self) -> pd.DataFrame: From d4e867457c4a933c62e7992719034a5d350dfcf6 Mon Sep 17 00:00:00 2001 From: Jacob Moss Date: Thu, 14 Nov 2024 15:50:23 +0000 Subject: [PATCH 07/10] support where some of ensembl_gene_id column are not ensembl gene ids --- bionty/base/entities/_gene.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bionty/base/entities/_gene.py b/bionty/base/entities/_gene.py index 1faecd3..7a5fc01 100644 --- a/bionty/base/entities/_gene.py +++ b/bionty/base/entities/_gene.py @@ -231,8 +231,8 @@ def add_external_db_column(df: pd.DataFrame, ext_db: str, df_col: str): df_res = df_res[~df_res["ensembl_gene_id"].isna()] # if stable_id is not ensembl_gene_id, keep a stable_id column - if not any(df_res["ensembl_gene_id"].str.startswith("ENS")): - logger.warning("no ensembl_gene_id found, writing to table_id column.") + if not all(df_res["ensembl_gene_id"].str.startswith("ENS")): + logger.warning("ensembl_gene_id column not all ENS-prefixed, writing to stable_id column.") df_res.insert(0, "stable_id", df_res.pop("ensembl_gene_id")) df_res = df_res.sort_values("stable_id").reset_index(drop=True) else: From 6e91ff71032e4040d268dec9b80b116f747f40a3 Mon Sep 17 00:00:00 2001 From: Lukas Heumos Date: Fri, 15 Nov 2024 10:57:29 +0100 Subject: [PATCH 08/10] =?UTF-8?q?=E2=9C=A8=20Add=20fork=20protected=20CI?= =?UTF-8?q?=20(#157)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: zethson --- .github/workflows/build.yml | 44 +++++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 604f6be..b081427 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,11 +1,11 @@ name: build - on: push: branches: [main] pull_request: branches: [main, staging] workflow_dispatch: + env: BRANCH_NAME: ${{ github.head_ref || github.ref_name }} @@ -17,45 +17,59 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.11"] + python-version: ["3.12"] group: ["bionty-unit", "bionty-docs"] timeout-minutes: 25 - steps: - uses: actions/checkout@v4 - name: checkout lndocs + if: ${{ github.event.pull_request.head.repo.full_name == github.repository }} uses: actions/checkout@v4 with: repository: laminlabs/lndocs ssh-key: ${{ secrets.READ_LNDOCS }} path: lndocs ref: main + - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} + - uses: actions/cache@v3 with: path: ~/.cache/pre-commit key: pre-commit-${{ runner.os }}-${{ hashFiles('.pre-commit-config.yaml') }} - - run: pip install "laminci@git+https://x-access-token:${{ secrets.LAMIN_BUILD_DOCS }}@github.com/laminlabs/laminci" - - run: | + + - name: Install laminci + run: pip install "laminci@git+https://x-access-token:${{ secrets.LAMIN_BUILD_DOCS }}@github.com/laminlabs/laminci" + + - name: Install dependencies + run: | uv pip install --system rich uv pip install --system ipywidgets + - uses: aws-actions/configure-aws-credentials@v4 with: aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} aws-region: eu-central-1 - - run: nox -s lint - if: matrix.python-version == '3.11' && matrix.group == 'bionty-unit' - - run: nox -s "build(group='${{ matrix.group }}')" + + - name: Run lint + if: matrix.group == 'bionty-unit' + run: nox -s lint + + - name: Run build + if: ${{ (matrix.group == 'bionty-docs' && github.event.pull_request.head.repo.full_name == github.repository) }} + run: nox -s "build(group='${{ matrix.group }}')" + - uses: actions/upload-artifact@v3 with: name: coverage--${{ matrix.group }} path: .coverage include-hidden-files: true + - uses: nwtgck/actions-netlify@v1.2 - if: ${{ matrix.python-version == '3.11' && matrix.group == 'bionty-docs' }} + if: ${{ (matrix.group == 'bionty-docs' && github.event.pull_request.head.repo.full_name == github.repository) }} with: publish-dir: "_build/html" production-deploy: ${{ github.event_name == 'push' }} @@ -70,19 +84,25 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 with: - python-version: "3.11" - - run: | + python-version: "3.12" + + - name: Install coverage dependencies + run: | pip install -U pip uv uv pip install --system coverage[toml] uv pip install --system --no-deps . + - uses: actions/download-artifact@v3 - - name: run coverage + + - name: Run coverage run: | coverage combine coverage--*/.coverage* coverage report --fail-under=0 coverage xml + - uses: codecov/codecov-action@v2 with: token: ${{ secrets.CODECOV_TOKEN }} From 67252acf9bdfc84ae2239551f6ecb855f4b7dfcd Mon Sep 17 00:00:00 2001 From: zethson Date: Fri, 15 Nov 2024 11:00:37 +0100 Subject: [PATCH 09/10] :bug: Only configure AWS credentials if not from fork Signed-off-by: zethson --- .github/workflows/build.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index b081427..3b738fb 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -49,6 +49,7 @@ jobs: uv pip install --system ipywidgets - uses: aws-actions/configure-aws-credentials@v4 + if: ${{ github.event.pull_request.head.repo.full_name == github.repository }} with: aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} From 436954d5a24bc9523a7846bab7b00f1e45b840b6 Mon Sep 17 00:00:00 2001 From: zethson Date: Fri, 15 Nov 2024 11:15:58 +0100 Subject: [PATCH 10/10] :art: Pre-commit Signed-off-by: zethson --- bionty/base/entities/_gene.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bionty/base/entities/_gene.py b/bionty/base/entities/_gene.py index 7a5fc01..dda60c7 100644 --- a/bionty/base/entities/_gene.py +++ b/bionty/base/entities/_gene.py @@ -232,7 +232,9 @@ def add_external_db_column(df: pd.DataFrame, ext_db: str, df_col: str): # if stable_id is not ensembl_gene_id, keep a stable_id column if not all(df_res["ensembl_gene_id"].str.startswith("ENS")): - logger.warning("ensembl_gene_id column not all ENS-prefixed, writing to stable_id column.") + logger.warning( + "ensembl_gene_id column not all ENS-prefixed, writing to stable_id column." + ) df_res.insert(0, "stable_id", df_res.pop("ensembl_gene_id")) df_res = df_res.sort_values("stable_id").reset_index(drop=True) else: