opentargets · xyg123 · Oct 3, 2024 · Oct 3, 2024 · Oct 3, 2024 · Oct 3, 2024
diff --git a/src/gentropy/config.py b/src/gentropy/config.py
@@ -282,6 +282,7 @@ class LocusToGeneConfig(StepConfig):
     wandb_run_name: str | None = None
     hf_hub_repo_id: str | None = "opentargets/locus_to_gene"
     download_from_hub: bool = True
+    # interval_sources: dict[str, str] | None = {"javierre": "gs://genetics_etl_python_playground/static_assets/javierre_2016_preprocessed", "thurman": "gs://genetics_etl_python_playground/static_assets/thurman_2012/genomewideCorrs_above0.7_promoterPlusMinus500kb_withGeneNames_32celltypeCategories.bed8.gz", "andersson": "gs://genetics_etl_python_playground/static_assets/andersson2014/enhancer_tss_associations.bed"}
     write_feature_matrix: bool = True
     _target_: str = "gentropy.l2g.LocusToGeneStep"
 

diff --git a/src/gentropy/dataset/l2g_features/intervals.py b/src/gentropy/dataset/l2g_features/intervals.py
@@ -0,0 +1,186 @@
+"""Collection of methods that extract features from the interval datasets."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+import pyspark.sql.functions as f
+from pyspark.sql import Window
+
+from gentropy.common.spark_helpers import convert_from_wide_to_long
+
+# from gentropy.dataset.colocalisation import Colocalisation
+from gentropy.dataset.intervals import Intervals
+from gentropy.dataset.l2g_features.l2g_feature import L2GFeature
+from gentropy.dataset.l2g_gold_standard import L2GGoldStandard
+
+# from gentropy.dataset.study_index import StudyIndex
+from gentropy.dataset.study_locus import StudyLocus
+
+if TYPE_CHECKING:
+    from pyspark.sql import DataFrame
+
+
+def common_interval_feature_logic(
+    study_loci_to_annotate: StudyLocus | L2GGoldStandard,
+    *,
+    intervals: Intervals,
+    feature_name: str,
+    interval_source: str,
+) -> DataFrame:
+    """Computes the feature.
+
+    Args:
+        study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci
+            that will be used for annotation
+        intervals (Intervals): The dataset containing interval information
+        feature_name (str): The name of the feature
+        interval_source (str): The datasource of the interval input
+
+    Returns:
+            DataFrame: Feature dataset
+    """
+    # Only implementing mean average interval features.
+    agg_expr = f.mean(f.col("weightedIntervalScore"))
+    return (
+        study_loci_to_annotate.df.withColumn("variantInLocus", f.explode_outer("locus"))
+        .select(
+            "studyLocusId",
+            f.col("variantInLocus.variantId").alias("variantInLocusId"),
+            f.col("variantInLocus.posteriorProbability").alias(
+                "variantInLocusPosteriorProbability"
+            ),
+        )
+        .join(
+            intervals.df.filter(f.col("datasourceId") == interval_source)
+            .withColumnRenamed("variantId", "variantInLocusId")
+            .withColumnRenamed("targetId", "geneId"),
+            on=["variantInLocusId", "geneId"],
+            how="inner",
+        )
+        .withColumn(
+            "weightedIntervalScore",
+            f.col("resourceScore") * f.col("variantInLocusPosteriorProbability"),
+        )
+        .groupBy("studyLocusId", "geneId")
+        .agg(agg_expr.alias(feature_name))
+    )
+
+
+def common_neighbourhood_interval_feature_logic(
+    study_loci_to_annotate: StudyLocus | L2GGoldStandard,
+    *,
+    intervals: Intervals,
+    feature_name: str,
+    interval_source: str,
+) -> DataFrame:
+    """Computes the feature.
+
+    Args:
+        study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
+        intervals (Intervals): The dataset containing interval information
+        feature_name (str): The name of the feature
+        interval_source (str): The datasource of the interval input
+
+    Returns:
+            DataFrame: Feature dataset
+    """
+    local_feature_name = feature_name.replace("Neighbourhood", "")
+    # First compute mean distances to a gene
+    local_max = common_interval_feature_logic(
+        study_loci_to_annotate,
+        feature_name=local_feature_name,
+        intervals=intervals,
+        interval_source=interval_source,
+    )
+    return (
+        # Then compute the max score in the vicinity (
+        # feature will be the same for any gene associated with a studyLocus)
+        local_max.withColumn(
+            "regional_maximum",
+            f.max(local_feature_name).over(Window.partitionBy("studyLocusId")),
+        )
+        .withColumn(feature_name, f.col(local_feature_name) - f.col("regional_maximum"))
+        .drop("regional_maximum")
+    )
+
+
+class PchicMeanFeature(L2GFeature):
+    """Average weighted CHiCAGO scores from studylocus to gene TSS."""
+
+    fill_na_value = 0  # would be 0 if implemented
+    feature_dependency_type = Intervals
+    feature_name = "pchicMean"
+
+    @classmethod
+    def compute(
+        cls: type[PchicMeanFeature],
+        study_loci_to_annotate: StudyLocus | L2GGoldStandard,
+        feature_dependency: dict[str, Any],
+    ) -> PchicMeanFeature:
+        """Computes the feature.
+
+        Args:
+            study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
+            feature_dependency (dict[str, Any]): Dataset that contains the distance information
+
+        Returns:
+            PchicMeanFeature: Feature dataset
+        """
+        interval_source = "javierre2016"
+        return cls(
+            _df=convert_from_wide_to_long(
+                common_interval_feature_logic(
+                    study_loci_to_annotate,
+                    feature_name=cls.feature_name,
+                    interval_source=interval_source,
+                    **feature_dependency,
+                ),
+                id_vars=("studyLocusId", "geneId"),
+                var_name="featureName",
+                value_name="featureValue",
+            ),
+            _schema=cls.get_schema(),
+        )
+
+
+class PchicMeanNeighbourhoodFeature(L2GFeature):
+    """Average weighted CHiCAGO scores from studylocus to gene TSS.
+
+    Proportional to strongest weighted CHiCAGO scores for all genes in the vicinity.
+    """
+
+    fill_na_value = 0  # would be 0 if implemented
+    feature_dependency_type = Intervals
+    feature_name = "pchicMeanNeighbourhood"
+
+    @classmethod
+    def compute(
+        cls: type[PchicMeanNeighbourhoodFeature],
+        study_loci_to_annotate: StudyLocus | L2GGoldStandard,
+        feature_dependency: dict[str, Any],
+    ) -> PchicMeanNeighbourhoodFeature:
+        """Computes the feature.
+
+        Args:
+            study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
+            feature_dependency (dict[str, Any]): Dataset that contains the distance information
+
+        Returns:
+            PchicMeanNeighbourhoodFeature: Feature dataset
+        """
+        interval_source = "javierre2016"
+        return cls(
+            _df=convert_from_wide_to_long(
+                common_neighbourhood_interval_feature_logic(
+                    study_loci_to_annotate,
+                    feature_name=cls.feature_name,
+                    interval_source=interval_source,
+                    **feature_dependency,
+                ),
+                id_vars=("studyLocusId", "geneId"),
+                var_name="featureName",
+                value_name="featureValue",
+            ),
+            _schema=cls.get_schema(),
+        )
diff --git a/src/gentropy/l2g.py b/src/gentropy/l2g.py
@@ -2,17 +2,20 @@
 
 from __future__ import annotations
 
+from functools import reduce
 from typing import Any
 
 import pyspark.sql.functions as f
 from sklearn.ensemble import GradientBoostingClassifier
 from wandb import login as wandb_login
 
+from gentropy.common.Liftover import LiftOverSpark
 from gentropy.common.session import Session
 from gentropy.common.utils import access_gcp_secret
 from gentropy.config import LocusToGeneConfig
 from gentropy.dataset.colocalisation import Colocalisation
 from gentropy.dataset.gene_index import GeneIndex
+from gentropy.dataset.intervals import Intervals
 from gentropy.dataset.l2g_feature_matrix import L2GFeatureMatrix
 from gentropy.dataset.l2g_gold_standard import L2GGoldStandard
 from gentropy.dataset.l2g_prediction import L2GPrediction
@@ -44,6 +47,9 @@ def __init__(
         study_index_path: str | None = None,
         gene_index_path: str | None = None,
         gene_interactions_path: str | None = None,
+        interval_path: dict[str, str] | None = None,
+        liftover_chain_file_path: str | None = None,
+        liftover_max_length_difference: int = 100,
         predictions_path: str | None = None,
         feature_matrix_path: str | None = None,
         write_feature_matrix: bool,
@@ -66,6 +72,9 @@ def __init__(
             study_index_path (str | None): Path to the study index dataset
             gene_index_path (str | None): Path to the gene index dataset
             gene_interactions_path (str | None): Path to the gene interactions dataset
+            interval_path (dict[str, str] | None) : Path and source of interval input datasets
+            liftover_chain_file_path (str | None) : Path to the liftover chain file
+            liftover_max_length_difference (int) : Maximum allowed difference for liftover
             predictions_path (str | None): Path to the L2G predictions output dataset
             feature_matrix_path (str | None): Path to the L2G feature matrix output dataset
             write_feature_matrix (bool): Whether to write the full feature matrix to the filesystem
@@ -104,6 +113,54 @@ def __init__(
             if variant_index_path
             else None
         )
+        self.gene_index = (
+            GeneIndex.from_parquet(session, gene_index_path)
+            if gene_index_path
+            else None
+        )
+        self.lift = (
+            LiftOverSpark(
+                liftover_chain_file_path,
+                liftover_max_length_difference,
+            )
+            if liftover_chain_file_path
+            else None
+        )
+
+        if self.variant_index and self.gene_index and self.lift and interval_path:
+            self.intervals = Intervals(
+                _df=reduce(
+                    lambda x, y: x.unionByName(y, allowMissingColumns=True),
+                    # create interval instances by parsing each source
+                    [
+                        Intervals.from_source(
+                            session.spark,
+                            source_name,
+                            source_path,
+                            self.gene_index,
+                            self.lift,
+                        ).df
+                        for source_name, source_path in interval_path.items()
+                    ],
+                )
+                .alias("interval")
+                .join(
+                    self.variant_index.df.selectExpr(
+                        "chromosome as vi_chromosome", "variantId", "position"
+                    ).alias("vi"),
+                    on=[
+                        f.col("vi.vi_chromosome") == f.col("interval.chromosome"),
+                        f.col("vi.position").between(
+                            f.col("interval.start"), f.col("interval.end")
+                        ),
+                    ],
+                    how="inner",
+                )
+                .drop("start", "end", "vi_chromosome", "position"),
+                _schema=Intervals.get_schema(),
+            )
+        else:
+            raise ValueError("variant_index is None, cannot join with intervals.")
         self.coloc = (
             Colocalisation.from_parquet(
                 session, colocalisation_path, recursiveFileLookup=True

diff --git a/src/gentropy/method/l2g/feature_factory.py b/src/gentropy/method/l2g/feature_factory.py
@@ -32,6 +32,10 @@
     DistanceTssMeanFeature,
     DistanceTssMeanNeighbourhoodFeature,
 )
+from gentropy.dataset.l2g_features.intervals import (
+    PchicMeanFeature,
+    PchicMeanNeighbourhoodFeature,
+)
 from gentropy.dataset.l2g_features.l2g_feature import L2GFeature
 from gentropy.dataset.l2g_features.vep import (
     VepMaximumFeature,
@@ -127,6 +131,8 @@ class FeatureFactory:
         "vepMeanNeighbourhood": VepMeanNeighbourhoodFeature,
         "vepMaximum": VepMaximumFeature,
         "vepMaximumNeighbourhood": VepMaximumNeighbourhoodFeature,
+        "pchicMean": PchicMeanFeature,
+        "pchicMeanNeighbourhood": PchicMeanNeighbourhoodFeature,
     }
 
     def __init__(