Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add interval logic for l2g features #812

Open
wants to merge 43 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
9c31f43
feat: add interval logic for l2g features
xyg123 Oct 3, 2024
330b79e
chore: fix docstrings
xyg123 Oct 3, 2024
183c827
chore: fix attribute errors
xyg123 Oct 3, 2024
500bae8
Merge branch 'dev' of https://github.com/opentargets/gentropy into xg…
xyg123 Oct 3, 2024
7cb4b5f
Merge branch 'dev' of https://github.com/opentargets/gentropy into xg…
xyg123 Oct 7, 2024
2035a52
fix: multiple input lines from merge
xyg123 Oct 7, 2024
985a901
fix: change to mean comparison, add additional interval features
xyg123 Oct 7, 2024
b01b4e8
fix: change to mean comparison, add additional interval features
xyg123 Oct 7, 2024
688c73a
Merge branch 'dev' of https://github.com/opentargets/gentropy into xg…
xyg123 Oct 7, 2024
6837df3
Merge branch 'dev' of https://github.com/opentargets/gentropy into xg…
xyg123 Oct 15, 2024
f194098
Merge branch 'dev' of https://github.com/opentargets/gentropy into xg…
xyg123 Oct 16, 2024
a9c0f6b
fix: change interval schema, reorganise interval processing, begin ad…
xyg123 Oct 17, 2024
63d6db6
Merge branch 'dev' of https://github.com/opentargets/gentropy into xg…
xyg123 Oct 17, 2024
374a7c3
Merge branch 'dev' of https://github.com/opentargets/gentropy into xg…
xyg123 Oct 18, 2024
29ad08b
Merge branch 'dev' of https://github.com/opentargets/gentropy into xg…
xyg123 Oct 21, 2024
42e4ce9
Merge branch 'dev' of https://github.com/opentargets/gentropy into xg…
xyg123 Nov 21, 2024
55f947f
fix: schema fixes
xyg123 Nov 22, 2024
1de5fcf
Merge branch 'dev' of https://github.com/opentargets/gentropy into xg…
xyg123 Dec 10, 2024
c332d93
Added working tests for interval + nbh features
xyg123 Dec 11, 2024
ee8c4f2
chore: pre-commit auto fixes [...]
pre-commit-ci[bot] Dec 11, 2024
737a827
fix: l2g_feature_matrix tests
xyg123 Dec 11, 2024
921c820
Merge branch 'xg1_l2g_intervals' of https://github.com/opentargets/ge…
xyg123 Dec 11, 2024
0e23427
chore: pre-commit auto fixes [...]
pre-commit-ci[bot] Dec 11, 2024
6ac2d12
fix l2g_feature_matrix tests
xyg123 Dec 11, 2024
b1b2aa5
Merge branch 'xg1_l2g_intervals' of https://github.com/opentargets/ge…
xyg123 Dec 11, 2024
4f893fb
fix l2g_feature_matrix tests
xyg123 Dec 11, 2024
2bbf69c
Merge branch 'dev' of https://github.com/opentargets/gentropy into xg…
xyg123 Dec 16, 2024
aed12ec
fix l2g step for intervals
xyg123 Dec 17, 2024
37109e3
Merge branch 'dev' of https://github.com/opentargets/gentropy into xg…
xyg123 Dec 17, 2024
054eaa3
chore: pre-commit auto fixes [...]
pre-commit-ci[bot] Dec 17, 2024
ad934c4
generate features by overlapping studyLocus variants
xyg123 Dec 17, 2024
0eea3aa
Merge branch 'xg1_l2g_intervals' of https://github.com/opentargets/ge…
xyg123 Dec 17, 2024
8140d5a
fix on l2g step mypy
xyg123 Dec 17, 2024
24dc8c3
type hint issue
xyg123 Dec 17, 2024
9aeb302
add datasource step to process intervals
xyg123 Dec 17, 2024
155fcdb
chore: pre-commit auto fixes [...]
pre-commit-ci[bot] Dec 17, 2024
53a6ff3
add interval doc .md
xyg123 Dec 19, 2024
b8914a7
Merge branch 'dev' of https://github.com/opentargets/gentropy into xg…
xyg123 Dec 19, 2024
78f661b
changes to config
xyg123 Dec 19, 2024
cf8b260
Merge branch 'xg1_l2g_intervals' of https://github.com/opentargets/ge…
xyg123 Dec 19, 2024
880cacf
chore: pre-commit auto fixes [...]
pre-commit-ci[bot] Dec 19, 2024
c076e17
address feature name comments and tests
xyg123 Dec 19, 2024
b074bc4
Merge branch 'xg1_l2g_intervals' of https://github.com/opentargets/ge…
xyg123 Dec 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/gentropy/config.py
xyg123 marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,7 @@ class LocusToGeneConfig(StepConfig):
wandb_run_name: str | None = None
hf_hub_repo_id: str | None = "opentargets/locus_to_gene"
download_from_hub: bool = True
#interval_sources = dict[str, str] | None = ["javierre": "gs://genetics_etl_python_playground/static_assets/javierre_2016_preprocessed", "thurman": "gs://genetics_etl_python_playground/static_assets/thurman_2012/genomewideCorrs_above0.7_promoterPlusMinus500kb_withGeneNames_32celltypeCategories.bed8.gz", "anderson":"gs://genetics_etl_python_playground/static_assets/andersson2014/enhancer_tss_associations.bed"]
write_feature_matrix: bool = True
_target_: str = "gentropy.l2g.LocusToGeneStep"

Expand Down
186 changes: 186 additions & 0 deletions src/gentropy/dataset/l2g_features/intervals.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
"""Collection of methods that extract features from the interval datasets."""

from __future__ import annotations

from typing import TYPE_CHECKING, Any

import pyspark.sql.functions as f
from pyspark.sql import Window

from gentropy.common.spark_helpers import convert_from_wide_to_long

# from gentropy.dataset.colocalisation import Colocalisation
xyg123 marked this conversation as resolved.
Show resolved Hide resolved
from gentropy.dataset.intervals import Intervals
from gentropy.dataset.l2g_features.l2g_feature import L2GFeature
from gentropy.dataset.l2g_gold_standard import L2GGoldStandard

# from gentropy.dataset.study_index import StudyIndex
from gentropy.dataset.study_locus import StudyLocus

if TYPE_CHECKING:
from pyspark.sql import DataFrame


def common_interval_feature_logic(
    study_loci_to_annotate: StudyLocus | L2GGoldStandard,
    *,
    intervals: Intervals,
    feature_name: str,
    interval_source: str,
) -> DataFrame:
    """Computes the mean posterior-probability-weighted interval score per study locus and gene.

    Every study locus is exploded into the variants of its `locus` array. The variants
    are matched against the intervals of a single datasource, each matched interval's
    `resourceScore` is multiplied by the posterior probability of the variant, and the
    weighted scores are averaged for every (studyLocusId, geneId) pair.

    Args:
        study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci
            that will be used for annotation
        intervals (Intervals): The dataset containing interval information
        feature_name (str): The name of the feature
        interval_source (str): The datasource of the interval input

    Returns:
        DataFrame: Feature dataset with the columns studyLocusId, geneId and `feature_name`
    """
    # One row per variant of each locus. Only the locus id, the variant id and its
    # posterior probability are kept, so no gene column exists on this side.
    variants_in_locus = study_loci_to_annotate.df.withColumn(
        "variantInLocus", f.explode_outer("locus")
    ).select(
        "studyLocusId",
        f.col("variantInLocus.variantId").alias("variantInLocusId"),
        f.col("variantInLocus.posteriorProbability").alias(
            "variantInLocusPosteriorProbability"
        ),
    )
    # Intervals of the requested datasource, renamed to line up with the locus side.
    source_intervals = (
        intervals.df.filter(f.col("datasourceId") == interval_source)
        .withColumnRenamed("variantId", "variantInLocusId")
        .withColumnRenamed("targetId", "geneId")
    )
    # Only implementing mean average interval features.
    agg_expr = f.mean(f.col("weightedIntervalScore"))
    return (
        # Join on the variant only: `geneId` is contributed by the interval side, so
        # listing it as a join key cannot be resolved against the locus side.
        variants_in_locus.join(
            source_intervals,
            on=["variantInLocusId"],
            how="inner",
        )
        .withColumn(
            "weightedIntervalScore",
            f.col("resourceScore") * f.col("variantInLocusPosteriorProbability"),
        )
        .groupBy("studyLocusId", "geneId")
        .agg(agg_expr.alias(feature_name))
    )


def common_neighbourhood_interval_feature_logic(
    study_loci_to_annotate: StudyLocus | L2GGoldStandard,
    *,
    intervals: Intervals,
    feature_name: str,
    interval_source: str,
) -> DataFrame:
    """Computes the neighbourhood variant of an interval feature.

    The per-gene (local) feature is computed first. The neighbourhood value of a gene
    is its local value minus the highest local value found among the genes of the
    same study locus.

    Args:
        study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
        intervals (Intervals): The dataset containing interval information
        feature_name (str): The name of the feature
        interval_source (str): The datasource of the interval input

    Returns:
        DataFrame: Feature dataset
    """
    # The local feature shares the name of the neighbourhood one, minus the suffix.
    local_feature_name = feature_name.replace("Neighbourhood", "")
    per_locus = Window.partitionBy("studyLocusId")

    # Mean weighted interval score of every (studyLocusId, geneId) pair.
    local_scores = common_interval_feature_logic(
        study_loci_to_annotate,
        feature_name=local_feature_name,
        intervals=intervals,
        interval_source=interval_source,
    )

    # Highest local score within the study locus; identical for all of its genes.
    # NOTE(review): a reviewer asked "Why is it maximum? According to the table and
    # what we discussed it should be mean?" - see
    # https://docs.google.com/spreadsheets/d/1wUs1AprRCCGItZmgDhc1fF5BtwCSosdzFv4NQ8V6Dtg/edit?gid=452826388#gid=452826388
    with_regional_maximum = local_scores.withColumn(
        "regional_maximum",
        f.max(local_feature_name).over(per_locus),
    )
    neighbourhood_scores = with_regional_maximum.withColumn(
        feature_name,
        f.col(local_feature_name) - f.col("regional_maximum"),
    )
    # NOTE(review): only the helper column is dropped; the local feature column stays
    # in the result next to the neighbourhood column - confirm this is intended.
    return neighbourhood_scores.drop("regional_maximum")


class PchicMeanFeature(L2GFeature):
    """Average weighted CHiCAGO scores from studylocus to gene TSS.

    Built from the intervals whose `datasourceId` is "javierre2016".
    """

    feature_name = "pchicMean"
    feature_dependency_type = Intervals
    # Presumably the value substituted when the feature is missing for a
    # locus/gene pair; it is consumed outside this module - TODO confirm.
    fill_na_value = 0

    @classmethod
    def compute(
        cls: type[PchicMeanFeature],
        study_loci_to_annotate: StudyLocus | L2GGoldStandard,
        feature_dependency: dict[str, Any],
    ) -> PchicMeanFeature:
        """Computes the feature.

        Args:
            study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
            feature_dependency (dict[str, Any]): Keyword arguments forwarded to `common_interval_feature_logic`; must provide `intervals`

        Returns:
            PchicMeanFeature: Feature dataset
        """
        # Wide layout: one column named after the feature per (studyLocusId, geneId).
        wide_feature_df = common_interval_feature_logic(
            study_loci_to_annotate,
            feature_name=cls.feature_name,
            interval_source="javierre2016",
            **feature_dependency,
        )
        # Long layout: featureName / featureValue rows keyed by the id columns.
        long_feature_df = convert_from_wide_to_long(
            wide_feature_df,
            id_vars=("studyLocusId", "geneId"),
            var_name="featureName",
            value_name="featureValue",
        )
        return cls(_df=long_feature_df, _schema=cls.get_schema())


class PchicMeanNeighbourhoodFeature(L2GFeature):
    """Average weighted CHiCAGO scores from studylocus to gene TSS.

    Expressed as the difference from the strongest weighted CHiCAGO score among all
    genes in the vicinity (the genes of the same study locus). Built from the
    intervals whose `datasourceId` is "javierre2016".
    """

    feature_name = "pchicMeanNeighbourhood"
    feature_dependency_type = Intervals
    # Presumably the value substituted when the feature is missing for a
    # locus/gene pair; it is consumed outside this module - TODO confirm.
    fill_na_value = 0

    @classmethod
    def compute(
        cls: type[PchicMeanNeighbourhoodFeature],
        study_loci_to_annotate: StudyLocus | L2GGoldStandard,
        feature_dependency: dict[str, Any],
    ) -> PchicMeanNeighbourhoodFeature:
        """Computes the feature.

        Args:
            study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
            feature_dependency (dict[str, Any]): Keyword arguments forwarded to `common_neighbourhood_interval_feature_logic`; must provide `intervals`

        Returns:
            PchicMeanNeighbourhoodFeature: Feature dataset
        """
        # Wide layout produced by the shared neighbourhood logic.
        wide_feature_df = common_neighbourhood_interval_feature_logic(
            study_loci_to_annotate,
            feature_name=cls.feature_name,
            interval_source="javierre2016",
            **feature_dependency,
        )
        # Long layout: featureName / featureValue rows keyed by the id columns.
        long_feature_df = convert_from_wide_to_long(
            wide_feature_df,
            id_vars=("studyLocusId", "geneId"),
            var_name="featureName",
            value_name="featureValue",
        )
        return cls(_df=long_feature_df, _schema=cls.get_schema())
57 changes: 57 additions & 0 deletions src/gentropy/l2g.py
xyg123 marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,20 @@

from __future__ import annotations

from functools import reduce
from typing import Any

import pyspark.sql.functions as f
from sklearn.ensemble import GradientBoostingClassifier
from wandb import login as wandb_login

from gentropy.common.Liftover import LiftOverSpark
from gentropy.common.session import Session
from gentropy.common.utils import access_gcp_secret
from gentropy.config import LocusToGeneConfig
from gentropy.dataset.colocalisation import Colocalisation
from gentropy.dataset.gene_index import GeneIndex
from gentropy.dataset.intervals import Intervals
from gentropy.dataset.l2g_feature_matrix import L2GFeatureMatrix
from gentropy.dataset.l2g_gold_standard import L2GGoldStandard
from gentropy.dataset.l2g_prediction import L2GPrediction
Expand Down Expand Up @@ -44,6 +47,9 @@ def __init__(
study_index_path: str | None = None,
gene_index_path: str | None = None,
gene_interactions_path: str | None = None,
interval_path: dict[str, str] | None = None,
liftover_chain_file_path: str | None = None,
liftover_max_length_difference: int = 100,
predictions_path: str | None = None,
feature_matrix_path: str | None = None,
write_feature_matrix: bool,
Expand All @@ -66,6 +72,9 @@ def __init__(
study_index_path (str | None): Path to the study index dataset
gene_index_path (str | None): Path to the gene index dataset
gene_interactions_path (str | None): Path to the gene interactions dataset
interval_path (dict[str, str] | None) : Path and source of interval input datasets
liftover_chain_file_path (str | None) : Path to the liftover chain file
liftover_max_length_difference (int) : Maximum allowed difference for liftover
predictions_path (str | None): Path to the L2G predictions output dataset
feature_matrix_path (str | None): Path to the L2G feature matrix output dataset
write_feature_matrix (bool): Whether to write the full feature matrix to the filesystem
Expand Down Expand Up @@ -104,6 +113,54 @@ def __init__(
if variant_index_path
else None
)
self.gene_index = (
GeneIndex.from_parquet(session, gene_index_path)
if gene_index_path
else None
)
self.lift = (
LiftOverSpark(
liftover_chain_file_path,
liftover_max_length_difference,
)
if liftover_chain_file_path
else None
)

if self.variant_index and self.gene_index and self.lift and interval_path:
xyg123 marked this conversation as resolved.
Show resolved Hide resolved
self.intervals = Intervals(
_df=reduce(
lambda x, y: x.unionByName(y, allowMissingColumns=True),
# create interval instances by parsing each source
[
Intervals.from_source(
xyg123 marked this conversation as resolved.
Show resolved Hide resolved
session.spark,
source_name,
source_path,
self.gene_index,
self.lift,
).df
for source_name, source_path in interval_path.items()
],
)
.alias("interval")
.join(
self.variant_index.df.selectExpr(
"chromosome as vi_chromosome", "variantId", "position"
).alias("vi"),
on=[
f.col("vi.vi_chromosome") == f.col("interval.chromosome"),
f.col("vi.position").between(
f.col("interval.start"), f.col("interval.end")
),
],
how="inner",
)
.drop("start", "end", "vi_chromosome", "position"),
_schema=Intervals.get_schema(),
xyg123 marked this conversation as resolved.
Show resolved Hide resolved
)
else:
raise ValueError("variant_index is None, cannot join with intervals.")
self.coloc = (
Colocalisation.from_parquet(
session, colocalisation_path, recursiveFileLookup=True
Expand Down
6 changes: 6 additions & 0 deletions src/gentropy/method/l2g/feature_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@
DistanceTssMeanFeature,
DistanceTssMeanNeighbourhoodFeature,
)
from gentropy.dataset.l2g_features.intervals import (
PchicMeanFeature,
PchicMeanNeighbourhoodFeature,
)
from gentropy.dataset.l2g_features.l2g_feature import L2GFeature
from gentropy.dataset.l2g_features.vep import (
VepMaximumFeature,
Expand Down Expand Up @@ -127,6 +131,8 @@ class FeatureFactory:
"vepMeanNeighbourhood": VepMeanNeighbourhoodFeature,
"vepMaximum": VepMaximumFeature,
"vepMaximumNeighbourhood": VepMaximumNeighbourhoodFeature,
"pchicMean": PchicMeanFeature,
"pchicMeanNeighbourhood": PchicMeanNeighbourhoodFeature,
}

def __init__(
Expand Down
Loading