
feat: adding parser for uniprot_variants evidence #214

Draft · wants to merge 11 commits into base: master
55 changes: 27 additions & 28 deletions common/evidence.py
@@ -3,7 +3,7 @@
from __future__ import annotations

import json
import logging
import logging.config
import os
import sys
import tempfile
@@ -21,38 +21,40 @@
from pyspark.sql import Column, DataFrame


def initialize_logger(
name: str, log_file: Optional[str] = None, log_level: int = logging.INFO
) -> None:
def initialize_logger(name: str, log_file: Optional[str] = None) -> None:
"""Initialize the logger.

Args:
name (str): Name of the logger. This is typically the name of the module. Required to identify the logger.
log_file (str): Path to the log file.
log_level (int): log level eg. logging.INFO, logging.ERROR

Returns:
None
"""
# Setting the format of the log messages:
log_formatter = logging.Formatter(
"%(asctime)s %(levelname)s %(module)s - %(funcName)s: %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
# Initialise logger:
with open(f"{sys.path[0]}/../logger_config.yaml", "r") as stream:
logger_config = yaml.load(stream, Loader=yaml.FullLoader)

logging.config.dictConfig(logger_config)
# # Setting the format of the log messages:
# log_formatter = logging.Formatter(
# "%(asctime)s %(levelname)s %(module)s - %(funcName)s: %(message)s",
# datefmt="%Y-%m-%d %H:%M:%S",
# )

logger = logging.getLogger(name)
logger.setLevel(log_level)
# logger = logging.getLogger(name)
# logger.setLevel(logging.DEBUG)

# Setting up stream handler:
stream_handler = logging.StreamHandler(sys.stderr)
stream_handler.setFormatter(log_formatter)
logger.addHandler(stream_handler)
# # Setting up stream handler:
# stream_handler = logging.StreamHandler(sys.stderr)
# stream_handler.setFormatter(log_formatter)
# logger.addHandler(stream_handler)

# If a log file is provided, add that handler too:
if log_file is not None:
file_handler = logging.FileHandler(log_file, mode="w")
file_handler.setFormatter(log_formatter)
logger.addHandler(file_handler)
# # If a log file is provided, add that handler too:
# if log_file is not None:
# file_handler = logging.FileHandler(log_file, mode="w")
# file_handler.setFormatter(log_formatter)
# logger.addHandler(file_handler)
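Note: the `logger_config.yaml` file referenced above is not included in this diff. For readers, a dictConfig-style configuration equivalent to the handlers that were removed might look like the sketch below (expressed as a Python dict; the formatter and handler names are illustrative assumptions, not taken from the repository).

```python
# Hypothetical equivalent of logger_config.yaml (the file itself is not shown
# in this PR). Any dictConfig-compatible mapping of this shape is accepted by
# logging.config.dictConfig() in initialize_logger.
import logging.config

LOGGER_CONFIG = {
    "version": 1,
    "disable_existing_loggers": False,
    "formatters": {
        "default": {
            "format": "%(asctime)s %(levelname)s %(module)s - %(funcName)s: %(message)s",
            "datefmt": "%Y-%m-%d %H:%M:%S",
        }
    },
    "handlers": {
        "console": {
            "class": "logging.StreamHandler",
            "formatter": "default",
            "stream": "ext://sys.stderr",
        }
    },
    "root": {"level": "INFO", "handlers": ["console"]},
}

logging.config.dictConfig(LOGGER_CONFIG)
```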


def detect_spark_memory_limit():
@@ -93,11 +95,7 @@ def initialize_sparksession() -> SparkSession:
.set("spark.sql.execution.arrow.maxRecordsPerBatch", "500000")
.set("spark.ui.showConsoleProgress", "false")
)
return (
SparkSession.builder.config(conf=spark_conf)
.master("local[*]")
.getOrCreate()
)
return SparkSession.builder.config(conf=spark_conf).master("local[*]").getOrCreate()
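For context, a parser entry point would typically combine the two helpers as in the sketch below (hypothetical driver code; the import path and the toy dataframe are assumptions for illustration, not part of this PR).

```python
# Minimal usage sketch: configure logging from logger_config.yaml, then obtain
# the local SparkSession with the settings shown above.
from common.evidence import initialize_logger, initialize_sparksession

initialize_logger(__name__)        # log_file is the only remaining optional argument
spark = initialize_sparksession()  # local[*] session

# Toy evidence frame standing in for parsed uniprot_variants evidence:
evidence = spark.createDataFrame(
    [("ENSG00000157764", "uniprot_variants")], "targetId string, datasourceId string"
)
evidence.show()
```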


class GenerateDiseaseCellLines:
@@ -346,13 +344,14 @@ def read_ppp_config(config_path: str) -> dict:

return parameters


def apply_bonferroni_correction(n_tests: int) -> float:
"""Multiple test correction based on the number of tests.

Args:
n_tests (int): Number of hypotheses tested, assuming they are independent

Returns:
float: new statistical significance level
"""
return 0.05 / n_tests
return 0.05 / n_tests
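A quick worked example of the correction (assuming the helper is importable as `common.evidence`): with 20 independent tests, the per-test significance threshold drops from 0.05 to 0.05 / 20 = 0.0025.

```python
# Worked example for apply_bonferroni_correction (illustrative only).
from common.evidence import apply_bonferroni_correction

alpha = apply_bonferroni_correction(20)
assert abs(alpha - 0.0025) < 1e-12  # 0.05 / 20
```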
82 changes: 58 additions & 24 deletions common/ontology.py
@@ -2,13 +2,18 @@
import os
import random
import time
from typing import Optional
Contributor Author: Changes in this file are mostly autoformatting.


import pandas as pd
from numpy import nan
from ontoma.interface import OnToma
from pandarallel import pandarallel
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import col, when
from pyspark.sql.types import StringType, StructField, StructType

logger = logging.getLogger(__name__)

ONTOMA_MAX_ATTEMPTS = 3
pandarallel.initialize()

@@ -22,33 +27,50 @@ def _simple_retry(func, **kwargs):
# If this is not the last attempt, wait until the next one.
if attempt != ONTOMA_MAX_ATTEMPTS:
time.sleep(5 + 10 * random.random())
logging.error(f'OnToma lookup failed for {kwargs!r}')
logging.error(f"OnToma lookup failed for {kwargs!r}")
return []
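Only the tail of `_simple_retry` is visible in this hunk. For readability, the surrounding retry loop presumably looks roughly like the reconstruction below (a sketch inferred from the visible lines, not the literal code in the PR).

```python
import logging
import random
import time

ONTOMA_MAX_ATTEMPTS = 3


def _simple_retry(func, **kwargs):
    """Retry a flaky lookup a few times; fall back to an empty result."""
    for attempt in range(1, ONTOMA_MAX_ATTEMPTS + 1):
        try:
            return func(**kwargs)
        except Exception:
            # If this is not the last attempt, wait until the next one.
            if attempt != ONTOMA_MAX_ATTEMPTS:
                time.sleep(5 + 10 * random.random())
    logging.error(f"OnToma lookup failed for {kwargs!r}")
    return []
```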


def _ontoma_udf(row, ontoma_instance):
"""Try to map first by disease name (because that branch of OnToma is more stable), then by disease ID."""
disease_name = None
if row['diseaseFromSource']:
disease_name = ' '.join(row['diseaseFromSource'].replace('obsolete', '').split())
disease_id = row['diseaseFromSourceId'].replace('_', ':') if row['diseaseFromSourceId'] else None
if row["diseaseFromSource"]:
disease_name = " ".join(
row["diseaseFromSource"].replace("obsolete", "").split()
)
disease_id = (
row["diseaseFromSourceId"].replace("_", ":")
if row["diseaseFromSourceId"]
else None
)
mappings = []
if disease_name:
mappings = _simple_retry(ontoma_instance.find_term, query=disease_name, code=False)
if not mappings and disease_id and ':' in disease_id:
mappings = _simple_retry(
ontoma_instance.find_term, query=disease_name, code=False
)
if not mappings and disease_id and ":" in disease_id:
mappings = _simple_retry(ontoma_instance.find_term, query=disease_id, code=True)
return [m.id_ot_schema for m in mappings]
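To illustrate the UDF's behaviour: the disease name is normalised (the word "obsolete" dropped, whitespace collapsed) and tried first; the identifier, with "_" rewritten to ":", is only used as a fallback. A hypothetical call could look like this (the inputs are made up; the returned IDs depend on the EFO release and OnToma's matching):

```python
# Hypothetical invocation of _ontoma_udf (not part of the PR).
from ontoma.interface import OnToma

ontoma_instance = OnToma(efo_release="latest")  # building the cache can be slow
row = {
    "diseaseFromSource": "obsolete  Alzheimer disease",
    "diseaseFromSourceId": "MONDO_0004975",
}
mapped_ids = _ontoma_udf(row, ontoma_instance)  # e.g. ["EFO_XXXXXXX", ...] or []
```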


def add_efo_mapping(evidence_strings, spark_instance, ontoma_cache_dir=None, efo_version=None):
def add_efo_mapping(
evidence_strings: DataFrame,
spark_instance: SparkSession,
ontoma_cache_dir: Optional[str] = None,
efo_version: Optional[str] = None,
):
"""Given evidence strings with diseaseFromSource and diseaseFromSourceId fields, try to populate EFO mapping
field diseaseFromSourceMappedId. In case there are multiple matches, the evidence strings will be exploded
accordingly.

Currently, both source columns (diseaseFromSource and diseaseFromSourceId) need to be present in the original
schema, although they do not have to be populated for all rows."""
logging.info('Collect all distinct (disease name, disease ID) pairs.')
disease_info_to_map = evidence_strings.select('diseaseFromSource', 'diseaseFromSourceId').distinct().toPandas()
logger.info("Collect all distinct (disease name, disease ID) pairs.")
disease_info_to_map: pd.DataFrame = (
evidence_strings.select("diseaseFromSource", "diseaseFromSourceId")
.distinct()
.toPandas()
)

# If no EFO version is specified:
if not efo_version:
@@ -57,38 +79,50 @@ def add_efo_mapping(evidence_strings, spark_instance, ontoma_cache_dir=None, efo
efo_version = os.environ["EFO_VERSION"]
# Set default version to latest.
else:
logging.warning('No EFO version specified. Using latest version.')
efo_version = 'latest'
logger.warning("No EFO version specified. Using latest version.")
efo_version = "latest"

logging.info(f'Initialise OnToma instance. Using EFO version {efo_version}')
logger.info(f"Initialise OnToma instance. Using EFO version {efo_version}")
ontoma_instance = OnToma(cache_dir=ontoma_cache_dir, efo_release=efo_version)

logging.info('Map disease information to EFO.')
disease_info_to_map['diseaseFromSourceMappedId'] = disease_info_to_map.parallel_apply(
_ontoma_udf, args=(ontoma_instance,), axis=1
logger.info("Map disease information to EFO.")
disease_info_to_map = disease_info_to_map.assign(
diseaseFromSourceMappedId=lambda df: df.parallel_apply(
_ontoma_udf, args=(ontoma_instance,), axis=1
)
)
disease_info_to_map = (
disease_info_to_map.explode('diseaseFromSourceMappedId')
disease_info_to_map.explode("diseaseFromSourceMappedId")
# Cast all null values to python None to avoid errors in Spark's DF
.fillna(nan).replace([nan], [None])
.fillna(nan)
.replace([nan], [None])
)
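The explode/fillna/replace chain turns one row per (name, ID) pair carrying a list of mappings into one row per mapping, and converts pandas NaN into Python None so Spark receives proper nulls. A small standalone illustration with toy data (not from the PR):

```python
import pandas as pd
from numpy import nan

toy = pd.DataFrame(
    {
        "diseaseFromSource": ["disease A", "disease B"],
        "diseaseFromSourceId": ["MONDO:0000001", None],
        "diseaseFromSourceMappedId": [["EFO_1", "EFO_2"], []],
    }
)
exploded = (
    toy.explode("diseaseFromSourceMappedId")
    # Empty lists explode to NaN; normalise None/NaN and cast back to None.
    .fillna(nan)
    .replace([nan], [None])
)
print(exploded)  # disease A yields two rows (EFO_1, EFO_2); disease B yields one row with None
```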

logging.info('Join the resulting information into the evidence strings.')
logger.info("Join the resulting information into the evidence strings.")
schema = StructType(
[
StructField("diseaseFromSource_right", StringType(), True),
StructField("diseaseFromSourceId_right", StringType(), True),
StructField("diseaseFromSourceMappedId", StringType(), True),
]
)
disease_info_df = spark_instance.createDataFrame(disease_info_to_map, schema=schema).withColumn(
'diseaseFromSourceMappedId', when(col('diseaseFromSourceMappedId') != 'nan', col('diseaseFromSourceMappedId'))
disease_info_df = spark_instance.createDataFrame(
disease_info_to_map, schema=schema
).withColumn(
"diseaseFromSourceMappedId",
when(
col("diseaseFromSourceMappedId") != "nan", col("diseaseFromSourceMappedId")
),
)
# WARNING: Spark's join operator is not null safe by default and, most of the time, `diseaseFromSourceId` will be null.
# `eqNullSafe` is a special null safe equality operator that is used to join the two dataframes.
join_cond = (evidence_strings.diseaseFromSource == disease_info_df.diseaseFromSource_right) & (
evidence_strings.diseaseFromSourceId.eqNullSafe(disease_info_df.diseaseFromSourceId_right)
join_cond = (
evidence_strings.diseaseFromSource == disease_info_df.diseaseFromSource_right
) & (
evidence_strings.diseaseFromSourceId.eqNullSafe(
disease_info_df.diseaseFromSourceId_right
)
)
return evidence_strings.join(disease_info_df, on=join_cond, how='left').drop(
'diseaseFromSource_right', 'diseaseFromSourceId_right'
return evidence_strings.join(disease_info_df, on=join_cond, how="left").drop(
"diseaseFromSource_right", "diseaseFromSourceId_right"
)
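The null-safe join matters because in Spark a plain `==` between two nulls evaluates to null (treated as not matching), so evidence rows without a diseaseFromSourceId would never pick up their mapping. A toy demonstration of the difference (standalone, not part of the PR):

```python
# eqNullSafe vs == when both sides are null (toy example).
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()
left = spark.createDataFrame([("disease A", None)], "name string, source_id string")
right = spark.createDataFrame(
    [("disease A", None, "EFO_1")], "name_r string, source_id_r string, efo string"
)

plain = left.join(right, left.source_id == right.source_id_r, "left")
null_safe = left.join(right, left.source_id.eqNullSafe(right.source_id_r), "left")

plain.select("efo").show()      # null: null == null does not match
null_safe.select("efo").show()  # EFO_1: the null-safe comparison matches
```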