From 4820e655fa6715dae5c6d0e9af81c86b3b4bbd10 Mon Sep 17 00:00:00 2001 From: Szymon Szyszkowski <69353402+project-defiant@users.noreply.github.com> Date: Mon, 16 Dec 2024 13:11:26 +0100 Subject: [PATCH] refactor: drop version_engine (#940) * refactor: drop use_version_from_input flag * chore: drop version engine --------- Co-authored-by: project-defiant Co-authored-by: Szymon Szyszkowski Co-authored-by: Daniel Suveges --- docs/python_api/common/version_engine.md | 12 -- src/gentropy/common/version_engine.py | 158 ------------------- src/gentropy/config.py | 30 ++-- src/gentropy/gnomad_ingestion.py | 22 +-- tests/gentropy/common/test_version_engine.py | 115 -------------- 5 files changed, 20 insertions(+), 317 deletions(-) delete mode 100644 docs/python_api/common/version_engine.md delete mode 100644 src/gentropy/common/version_engine.py delete mode 100644 tests/gentropy/common/test_version_engine.py diff --git a/docs/python_api/common/version_engine.md b/docs/python_api/common/version_engine.md deleted file mode 100644 index 28d9b4b2e..000000000 --- a/docs/python_api/common/version_engine.md +++ /dev/null @@ -1,12 +0,0 @@ ---- -title: VersionEngine ---- - -**VersionEngine**: - -Version engine allows for registering datasource specific version seeker class to retrieve datasource version used as input to gentropy steps. Currently implemented only for GnomAD datasource. - -This class can be then used to produce automation over output directory versioning. - -:::gentropy.common.version_engine.VersionEngine -:::gentropy.common.version_engine.GnomADVersionSeeker diff --git a/src/gentropy/common/version_engine.py b/src/gentropy/common/version_engine.py deleted file mode 100644 index d852d8f5d..000000000 --- a/src/gentropy/common/version_engine.py +++ /dev/null @@ -1,158 +0,0 @@ -"""Mechanism to seek version from specific datasource.""" - -from __future__ import annotations - -import re -from abc import ABC, abstractmethod -from pathlib import Path -from typing import Callable - -from gentropy.common.types import DataSourceType - - -class VersionEngine: - """Seek version from the datasource.""" - - def __init__(self, datasource: DataSourceType) -> None: - """Initialize VersionEngine. - - Args: - datasource (DataSourceType): datasource to seek the version from - """ - self.datasource = datasource - - @staticmethod - def version_seekers() -> dict[DataSourceType, DatasourceVersionSeeker]: - """List version seekers. - - Returns: - dict[DataSourceType, DatasourceVersionSeeker]: list of available data sources. - """ - return { - "gnomad": GnomADVersionSeeker(), - } - - def seek(self, text: str | Path) -> str: - """Interface for inferring the version from text by using registered data source version iner method. - - Args: - text (str | Path): text to seek version from - - Returns: - str: inferred version - - Raises: - TypeError: if version can not be found in the text - - Examples: - >>> VersionEngine("gnomad").seek("gs://gcp-public-data--gnomad/release/2.1.1/vcf/genomes/gnomad.genomes.r2.1.1.sites.vcf.bgz") - '2.1.1' - """ - match text: - case Path() | str(): - text = str(text) - case _: - msg = f"Can not find version in {text}" - raise TypeError(msg) - infer_method = self._get_version_seek_method() - return infer_method(text) - - def _get_version_seek_method(self) -> Callable[[str], str]: - """Method that gets the version seeker for the datasource. - - Returns: - Callable[[str], str]: Method to seek version based on the initialized datasource - - Raises: - ValueError: if datasource is not registered in the list of version seekers - """ - if self.datasource not in self.version_seekers(): - raise ValueError(f"Invalid datasource {self.datasource}") - return self.version_seekers()[self.datasource].seek_version - - def amend_version( - self, analysis_input_path: str | Path, analysis_output_path: str | Path - ) -> str: - """Amend version to the analysis output path if it is not already present. - - Path can be path to g3:// or Path object, absolute or relative. - The analysis_input_path has to contain the version number. - If the analysis_output_path contains the same version as inferred from input version already, - then it will not be appended. - - Args: - analysis_input_path (str | Path): step input path - analysis_output_path (str | Path): step output path - - Returns: - str: Path with the ammended version, does not return Path object! - - Examples: - >>> VersionEngine("gnomad").amend_version("gs://gcp-public-data--gnomad/release/2.1.1/vcf/genomes/gnomad.genomes.r2.1.1.sites.vcf.bgz", "/some/path/without/version") - '/some/path/without/version/2.1.1' - """ - version = self.seek(analysis_input_path) - output_path = str(analysis_output_path) - if version in output_path: - return output_path - if output_path.endswith("/"): - return f"{analysis_output_path}{version}" - return f"{analysis_output_path}/{version}" - - -class DatasourceVersionSeeker(ABC): - """Interface for datasource version seeker. - - Raises: - NotImplementedError: if method is not implemented in the subclass - """ - - @staticmethod - @abstractmethod - def seek_version(text: str) -> str: - """Seek version from text. Implement this method in the subclass. - - Args: - text (str): text to seek version from - - Returns: - str: seeked version - - Raises: - NotImplementedError: if method is not implemented in the subclass - - """ - raise NotImplementedError - - -class GnomADVersionSeeker(DatasourceVersionSeeker): - """Seek version from GnomAD datasource.""" - - @staticmethod - def seek_version(text: str) -> str: - """Seek GnomAD version from provided text by using regex. - - Up to 3 digits are allowed in the version number. - Historically gnomAD version numbers have been in the format - 2.1.1, 3.1, etc. as of 2024-05. GnomAD versions can be found by - running `"gs://gcp-public-data--gnomad/release/*/*/*"` - - Args: - text (str): text to seek version from - - Raises: - ValueError: if version can not be seeked - - Returns: - str: seeked version - - Examples: - >>> GnomADVersionSeeker.seek_version("gs://gcp-public-data--gnomad/release/2.1.1/vcf/genomes/gnomad.genomes.r2.1.1.sites.vcf.bgz") - '2.1.1' - """ - result = re.search(r"v?((\d+){1}\.(\d+){1}\.?(\d+)?)", text) - match result: - case None: - raise ValueError(f"No GnomAD version found in provided text: {text}") - case _: - return result.group(1) diff --git a/src/gentropy/config.py b/src/gentropy/config.py index 9c454d41b..0d9d6abf1 100644 --- a/src/gentropy/config.py +++ b/src/gentropy/config.py @@ -17,7 +17,8 @@ class SessionConfig: write_mode: str = "errorifexists" spark_uri: str = "local[*]" hail_home: str = os.path.dirname(hail_location) - extended_spark_conf: dict[str, str] | None = field(default_factory=dict[str, str]) + extended_spark_conf: dict[str, str] | None = field( + default_factory=dict[str, str]) output_partitions: int = 200 _target_: str = "gentropy.common.session.Session" @@ -39,7 +40,8 @@ class ColocalisationConfig(StepConfig): credible_set_path: str = MISSING coloc_path: str = MISSING colocalisation_method: str = MISSING - colocalisation_method_params: dict[str, Any] = field(default_factory=dict[str, Any]) + colocalisation_method_params: dict[str, Any] = field( + default_factory=dict[str, Any]) _target_: str = "gentropy.colocalisation.ColocalisationStep" @@ -124,7 +126,8 @@ class EqtlCatalogueConfig(StepConfig): eqtl_catalogue_paths_imported: str = MISSING eqtl_catalogue_study_index_out: str = MISSING eqtl_catalogue_credible_sets_out: str = MISSING - mqtl_quantification_methods_blacklist: list[str] = field(default_factory=lambda: []) + mqtl_quantification_methods_blacklist: list[str] = field( + default_factory=lambda: []) eqtl_lead_pvalue_threshold: float = 1e-3 _target_: str = "gentropy.eqtl_catalogue.EqtlCatalogueStep" @@ -146,7 +149,8 @@ class FinngenStudiesConfig(StepConfig): ) finngen_summary_stats_url_suffix: str = ".gz" efo_curation_mapping_url: str = "https://raw.githubusercontent.com/opentargets/curation/24.09.1/mappings/disease/manual_string.tsv" - sample_size: int = 453733 # https://www.finngen.fi/en/access_results#:~:text=Total%20sample%20size%3A%C2%A0453%2C733%C2%A0(254%2C618%C2%A0females%20and%C2%A0199%2C115%20males) + # https://www.finngen.fi/en/access_results#:~:text=Total%20sample%20size%3A%C2%A0453%2C733%C2%A0(254%2C618%C2%A0females%20and%C2%A0199%2C115%20males) + sample_size: int = 453733 _target_: str = "gentropy.finngen_studies.FinnGenStudiesStep" @@ -199,7 +203,6 @@ class LDIndexConfig(StepConfig): "nfe", # Non-Finnish European ] ) - use_version_from_input: bool = False _target_: str = "gentropy.gnomad_ingestion.LDIndexStep" @@ -409,7 +412,6 @@ class GnomadVariantConfig(StepConfig): "remaining", # Other ] ) - use_version_from_input: bool = False _target_: str = "gentropy.gnomad_ingestion.GnomadVariantIndexStep" @@ -432,7 +434,6 @@ class PanUKBBConfig(StepConfig): "EUR", # European ] ) - use_version_from_input: bool = False _target_: str = "gentropy.pan_ukb_ingestion.PanUKBBVariantIndexStep" @@ -680,7 +681,8 @@ class Config: """Application configuration.""" # this is unfortunately verbose due to @dataclass limitations - defaults: List[Any] = field(default_factory=lambda: ["_self_", {"step": MISSING}]) + defaults: List[Any] = field(default_factory=lambda: [ + "_self_", {"step": MISSING}]) step: StepConfig = MISSING datasets: dict[str, str] = field(default_factory=dict) @@ -714,7 +716,8 @@ def register_config() -> None: name="gwas_catalog_top_hit_ingestion", node=GWASCatalogTopHitIngestionConfig, ) - cs.store(group="step", name="ld_based_clumping", node=LDBasedClumpingConfig) + cs.store(group="step", name="ld_based_clumping", + node=LDBasedClumpingConfig) cs.store(group="step", name="ld_index", node=LDIndexConfig) cs.store(group="step", name="locus_to_gene", node=LocusToGeneConfig) cs.store( @@ -732,7 +735,8 @@ def register_config() -> None: cs.store(group="step", name="pics", node=PICSConfig) cs.store(group="step", name="gnomad_variants", node=GnomadVariantConfig) - cs.store(group="step", name="ukb_ppp_eur_sumstat_preprocess", node=UkbPppEurConfig) + cs.store(group="step", name="ukb_ppp_eur_sumstat_preprocess", + node=UkbPppEurConfig) cs.store(group="step", name="variant_index", node=VariantIndexConfig) cs.store(group="step", name="variant_to_vcf", node=ConvertToVcfStepConfig) cs.store( @@ -765,5 +769,7 @@ def register_config() -> None: name="locus_to_gene_associations", node=LocusToGeneAssociationsStepConfig, ) - cs.store(group="step", name="finngen_ukb_meta_ingestion", node=FinngenUkbMetaConfig) - cs.store(group="step", name="credible_set_qc", node=CredibleSetQCStepConfig) + cs.store(group="step", name="finngen_ukb_meta_ingestion", + node=FinngenUkbMetaConfig) + cs.store(group="step", name="credible_set_qc", + node=CredibleSetQCStepConfig) diff --git a/src/gentropy/gnomad_ingestion.py b/src/gentropy/gnomad_ingestion.py index d930b54c6..e0960c8f7 100644 --- a/src/gentropy/gnomad_ingestion.py +++ b/src/gentropy/gnomad_ingestion.py @@ -4,7 +4,6 @@ from gentropy.common.session import Session from gentropy.common.types import LD_Population, VariantPopulation -from gentropy.common.version_engine import VersionEngine from gentropy.config import GnomadVariantConfig, LDIndexConfig from gentropy.datasource.gnomad.ld import GnomADLDMatrix from gentropy.datasource.gnomad.variants import GnomADVariants @@ -26,10 +25,10 @@ def __init__( min_r2: float = LDIndexConfig().min_r2, ld_matrix_template: str = LDIndexConfig().ld_matrix_template, ld_index_raw_template: str = LDIndexConfig().ld_index_raw_template, - ld_populations: list[LD_Population | str] = LDIndexConfig().ld_populations, + ld_populations: list[LD_Population | + str] = LDIndexConfig().ld_populations, liftover_ht_path: str = LDIndexConfig().liftover_ht_path, grch37_to_grch38_chain_path: str = LDIndexConfig().grch37_to_grch38_chain_path, - use_version_from_input: bool = LDIndexConfig().use_version_from_input, ) -> None: """Run step. @@ -42,17 +41,9 @@ def __init__( ld_populations (list[LD_Population | str]): Population names derived from the ld file paths liftover_ht_path (str): Path to the liftover ht file grch37_to_grch38_chain_path (str): Path to the chain file used to lift over the coordinates. - use_version_from_input (bool): Append version derived from input ld_matrix_template to the output ld_index_out. Defaults to False. - In case use_version_from_input is set to True, - data source version inferred from ld_matrix_temolate is appended as the last path segment to the output path. Default values are provided in LDIndexConfig. """ - if use_version_from_input: - # amend data source version to output path - ld_index_out = VersionEngine("gnomad").amend_version( - ld_matrix_template, ld_index_out - ) ( GnomADLDMatrix( ld_matrix_template=ld_matrix_template, @@ -84,7 +75,6 @@ def __init__( gnomad_variant_populations: list[ VariantPopulation | str ] = GnomadVariantConfig().gnomad_variant_populations, - use_version_from_input: bool = GnomadVariantConfig().use_version_from_input, ) -> None: """Run Variant Annotation step. @@ -93,18 +83,10 @@ def __init__( variant_annotation_path (str): Path to resulting dataset. gnomad_genomes_path (str): Path to gnomAD genomes hail table, e.g. `gs://gcp-public-data--gnomad/release/4.0/ht/genomes/gnomad.genomes.v4.0.sites.ht/`. gnomad_variant_populations (list[VariantPopulation | str]): List of populations to include. - use_version_from_input (bool): Append version derived from input gnomad_genomes_path to the output variant_annotation_path. Defaults to False. - In case use_version_from_input is set to True, - data source version inferred from gnomad_genomes_path is appended as the last path segment to the output path. All defaults are stored in the GnomadVariantConfig. """ # amend data source version to output path - if use_version_from_input: - variant_annotation_path = VersionEngine("gnomad").amend_version( - gnomad_genomes_path, variant_annotation_path - ) - session.logger.info("Gnomad variant annotation path:") session.logger.info(variant_annotation_path) # Parse variant info from source. diff --git a/tests/gentropy/common/test_version_engine.py b/tests/gentropy/common/test_version_engine.py deleted file mode 100644 index 2ee2e12ce..000000000 --- a/tests/gentropy/common/test_version_engine.py +++ /dev/null @@ -1,115 +0,0 @@ -"""Tests version engine class.""" - -from __future__ import annotations - -from pathlib import Path - -import pytest - -from gentropy.common.version_engine import GnomADVersionSeeker, VersionEngine - - -@pytest.mark.parametrize( - ["text", "version"], - [ - pytest.param( - "gcp-public-data--gnomad/release/2.1.1/vcf/genomes/gnomad.genomes.r2.1.1.sites.7.vcf", - "2.1.1", - id="GnomAD v2.1.1", - ), - pytest.param( - "/gcp-public-data--gnomad/release/3.0/vcf/genomes/gnomad.genomes.r3.0.sites.chr6.vcf", - "3.0", - id="GnomAD v3.0", - ), - pytest.param( - "gs://gcp-public-data--gnomad/release/3.1.1/vcf/genomes/gnomad.genomes.v3.1.1.sites.chr1.vcf", - "3.1.1", - id="GnomAD v3.1.1", - ), - pytest.param( - "gs://gcp-public-data--gnomad/release/3.1.2/vcf/genomes/gnomad.genomes.v3.1.2.sites.chrY.vcf", - "3.1.2", - id="GnomAD v3.1.2", - ), - pytest.param( - "gsa://gcp-public-data--gnomad/release/4.0/vcf/genomes/gnomad.genomes.v4.0.sites.chrY.vcf", - "4.0", - id="GnomAD v4.0", - ), - pytest.param( - "gs://gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr18.vcf", - "4.1", - id="GnomAD v4.1", - ), - pytest.param( - "/some/path/to/the/version/r20.111.44", - "20.111.44", - id="Extreme version number", - ), - ], -) -def test_extracting_version_with_gnomad_seeker(text: str, version: str) -> None: - """Test gnomad version extraction with GnomADVersionSeeker.""" - version_seeker = GnomADVersionSeeker().seek_version - assert version_seeker(text) == version - - -def test_not_registered_datasource_raises_error() -> None: - """Test that unknown datasource raises error.""" - with pytest.raises(ValueError) as e: - VersionEngine("ClinVar").seek("some/path/to/the/version/v20.111.44") # type: ignore - assert e.value.args[0].startswith("Invalid datasource ClinVar") - - -def test_extracting_version_when_no_version_is_found() -> None: - """Test that unknown datasource raises error.""" - with pytest.raises(ValueError) as e: - VersionEngine("ClinVar").seek("some/path/without/version") # type: ignore - assert e.value.args[0].startswith( - "Can not find version in some/path/without/version" - ) - - -def test_non_string_path_raises_error() -> None: - """Test that non-string path raises error.""" - with pytest.raises(TypeError) as e: - VersionEngine("gnomad").seek(123) # type: ignore - assert e.value.args[0].startswith("Can not infer version from 123") - - -@pytest.mark.parametrize( - ["text", "version"], - [ - pytest.param(Path("some/file/path/v3.1.1"), "3.1.1", id="Path object"), - pytest.param("s3://some/file/path/v3.1.1", "3.1.1", id="S3 protocol"), - pytest.param("gs://some/file/path/v3.1.1", "3.1.1", id="GS protocol"), - ], -) -def test_extracting_version_with_version_engine(text: str | Path, version: str) -> None: - """Check if concrete data types and file protocols does not return an error while passed to VersionEngine.""" - assert VersionEngine("gnomad").seek(text) == version - - -@pytest.mark.parametrize( - ["input_path", "output_path", "expected_output"], - [ - pytest.param( - "input/v20.111.44", "output", "output/20.111.44", id="Append version" - ), - pytest.param( - "input/1.0.0", - "output/1.0.0", - "output/1.0.0", - id="Do not append version, already present", - ), - pytest.param( - Path("input/1.0.0"), Path("output/"), "output/1.0.0", id="Path objects" - ), - ], -) -def test_appending_version_to_path( - input_path: Path | str, output_path: Path | str, expected_output: str -) -> None: - """Test that the version is ammended at the end of the output path.""" - VersionEngine("gnomad").amend_version(input_path, output_path) == expected_output