
feat: adding parser for uniprot_variants evidence #214

Draft · wants to merge 11 commits into base: master
55 changes: 27 additions & 28 deletions common/evidence.py
@@ -3,7 +3,7 @@
from __future__ import annotations

import json
import logging
import logging.config
import os
import sys
import tempfile
@@ -21,38 +21,40 @@
from pyspark.sql import Column, DataFrame


def initialize_logger(
name: str, log_file: Optional[str] = None, log_level: int = logging.INFO
) -> None:
def initialize_logger(name: str, log_file: Optional[str] = None) -> None:
"""Initialize the logger.

Args:
name (str): Name of the logger. This is typically the name of the module. Required to identify the logger.
log_file (str): Path to the log file.
log_level (int): log level eg. logging.INFO, logging.ERROR

Returns:
None
"""
# Setting the format of the log messages:
log_formatter = logging.Formatter(
"%(asctime)s %(levelname)s %(module)s - %(funcName)s: %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
# Initialise logger:
with open(f"{sys.path[0]}/../logger_config.yaml", "r") as stream:
logger_config = yaml.load(stream, Loader=yaml.FullLoader)

logging.config.dictConfig(logger_config)
# # Setting the format of the log messages:
# log_formatter = logging.Formatter(
# "%(asctime)s %(levelname)s %(module)s - %(funcName)s: %(message)s",
# datefmt="%Y-%m-%d %H:%M:%S",
# )

logger = logging.getLogger(name)
logger.setLevel(log_level)
# logger = logging.getLogger(name)
# logger.setLevel(logging.DEBUG)

# Setting up stream handler:
stream_handler = logging.StreamHandler(sys.stderr)
stream_handler.setFormatter(log_formatter)
logger.addHandler(stream_handler)
# # Setting up stream handler:
# stream_handler = logging.StreamHandler(sys.stderr)
# stream_handler.setFormatter(log_formatter)
# logger.addHandler(stream_handler)

# If a log file is provided, add that handler too:
if log_file is not None:
file_handler = logging.FileHandler(log_file, mode="w")
file_handler.setFormatter(log_formatter)
logger.addHandler(file_handler)
# # If a log file is provided, add that handler too:
# if log_file is not None:
# file_handler = logging.FileHandler(log_file, mode="w")
# file_handler.setFormatter(log_formatter)
# logger.addHandler(file_handler)
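Note: the `logger_config.yaml` file referenced above is not included in this diff. For readers, a dictConfig-style configuration equivalent to the handlers that were removed might look like the sketch below (expressed as a Python dict; the formatter and handler names are illustrative assumptions, not taken from the repository).

```python
# Hypothetical equivalent of logger_config.yaml (the file itself is not shown
# in this PR). Any dictConfig-compatible mapping of this shape is accepted by
# logging.config.dictConfig() in initialize_logger.
import logging.config

LOGGER_CONFIG = {
    "version": 1,
    "disable_existing_loggers": False,
    "formatters": {
        "default": {
            "format": "%(asctime)s %(levelname)s %(module)s - %(funcName)s: %(message)s",
            "datefmt": "%Y-%m-%d %H:%M:%S",
        }
    },
    "handlers": {
        "console": {
            "class": "logging.StreamHandler",
            "formatter": "default",
            "stream": "ext://sys.stderr",
        }
    },
    "root": {"level": "INFO", "handlers": ["console"]},
}

logging.config.dictConfig(LOGGER_CONFIG)
```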


def detect_spark_memory_limit():
@@ -93,11 +95,7 @@ def initialize_sparksession() -> SparkSession:
.set("spark.sql.execution.arrow.maxRecordsPerBatch", "500000")
.set("spark.ui.showConsoleProgress", "false")
)
return (
SparkSession.builder.config(conf=spark_conf)
.master("local[*]")
.getOrCreate()
)
return SparkSession.builder.config(conf=spark_conf).master("local[*]").getOrCreate()
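For context, a parser entry point would typically combine the two helpers as in the sketch below (hypothetical driver code; the import path and the toy dataframe are assumptions for illustration, not part of this PR).

```python
# Minimal usage sketch: configure logging from logger_config.yaml, then obtain
# the local SparkSession with the settings shown above.
from common.evidence import initialize_logger, initialize_sparksession

initialize_logger(__name__)        # log_file is the only remaining optional argument
spark = initialize_sparksession()  # local[*] session

# Toy evidence frame standing in for parsed uniprot_variants evidence:
evidence = spark.createDataFrame(
    [("ENSG00000157764", "uniprot_variants")], "targetId string, datasourceId string"
)
evidence.show()
```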


class GenerateDiseaseCellLines:
@@ -346,13 +344,14 @@ def read_ppp_config(config_path: str) -> dict:

return parameters


def apply_bonferroni_correction(n_tests: int) -> float:
"""Multiple test correction based on the number of tests.

Args:
n_tests (int): Number of hypotheses tested, assuming they are independent

Returns:
float: new statistical significance level
"""
return 0.05 / n_tests
return 0.05 / n_tests
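A quick worked example of the correction (assuming the helper is importable as `common.evidence`): with 20 independent tests, the per-test significance threshold drops from 0.05 to 0.05 / 20 = 0.0025.

```python
# Worked example for apply_bonferroni_correction (illustrative only).
from common.evidence import apply_bonferroni_correction

alpha = apply_bonferroni_correction(20)
assert abs(alpha - 0.0025) < 1e-12  # 0.05 / 20
```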
82 changes: 58 additions & 24 deletions common/ontology.py
@@ -2,13 +2,18 @@
import os
import random
import time
from typing import Optional
Contributor Author: Changes in this file are mostly autoformatting.


import pandas as pd
from numpy import nan
from ontoma.interface import OnToma
from pandarallel import pandarallel
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import col, when
from pyspark.sql.types import StringType, StructField, StructType

logger = logging.getLogger(__name__)

ONTOMA_MAX_ATTEMPTS = 3
pandarallel.initialize()

@@ -22,33 +27,50 @@ def _simple_retry(func, **kwargs):
# If this is not the last attempt, wait until the next one.
if attempt != ONTOMA_MAX_ATTEMPTS:
time.sleep(5 + 10 * random.random())
logging.error(f'OnToma lookup failed for {kwargs!r}')
logging.error(f"OnToma lookup failed for {kwargs!r}")
return []
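Only the tail of `_simple_retry` is visible in this hunk. For readability, the surrounding retry loop presumably looks roughly like the reconstruction below (a sketch inferred from the visible lines, not the literal code in the PR).

```python
import logging
import random
import time

ONTOMA_MAX_ATTEMPTS = 3


def _simple_retry(func, **kwargs):
    """Retry a flaky lookup a few times; fall back to an empty result."""
    for attempt in range(1, ONTOMA_MAX_ATTEMPTS + 1):
        try:
            return func(**kwargs)
        except Exception:
            # If this is not the last attempt, wait until the next one.
            if attempt != ONTOMA_MAX_ATTEMPTS:
                time.sleep(5 + 10 * random.random())
    logging.error(f"OnToma lookup failed for {kwargs!r}")
    return []
```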


def _ontoma_udf(row, ontoma_instance):
"""Try to map first by disease name (because that branch of OnToma is more stable), then by disease ID."""
disease_name = None
if row['diseaseFromSource']:
disease_name = ' '.join(row['diseaseFromSource'].replace('obsolete', '').split())
disease_id = row['diseaseFromSourceId'].replace('_', ':') if row['diseaseFromSourceId'] else None
if row["diseaseFromSource"]:
disease_name = " ".join(
row["diseaseFromSource"].replace("obsolete", "").split()
)
disease_id = (
row["diseaseFromSourceId"].replace("_", ":")
if row["diseaseFromSourceId"]
else None
)
mappings = []
if disease_name:
mappings = _simple_retry(ontoma_instance.find_term, query=disease_name, code=False)
if not mappings and disease_id and ':' in disease_id:
mappings = _simple_retry(
ontoma_instance.find_term, query=disease_name, code=False
)
if not mappings and disease_id and ":" in disease_id:
mappings = _simple_retry(ontoma_instance.find_term, query=disease_id, code=True)
return [m.id_ot_schema for m in mappings]
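To illustrate the UDF's behaviour: the disease name is normalised (the word "obsolete" dropped, whitespace collapsed) and tried first; the identifier, with "_" rewritten to ":", is only used as a fallback. A hypothetical call could look like this (the inputs are made up; the returned IDs depend on the EFO release and OnToma's matching):

```python
# Hypothetical invocation of _ontoma_udf (not part of the PR).
from ontoma.interface import OnToma

ontoma_instance = OnToma(efo_release="latest")  # building the cache can be slow
row = {
    "diseaseFromSource": "obsolete  Alzheimer disease",
    "diseaseFromSourceId": "MONDO_0004975",
}
mapped_ids = _ontoma_udf(row, ontoma_instance)  # e.g. ["EFO_XXXXXXX", ...] or []
```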


def add_efo_mapping(evidence_strings, spark_instance, ontoma_cache_dir=None, efo_version=None):
def add_efo_mapping(
evidence_strings: DataFrame,
spark_instance: SparkSession,
ontoma_cache_dir: Optional[str] = None,
efo_version: Optional[str] = None,
):
"""Given evidence strings with diseaseFromSource and diseaseFromSourceId fields, try to populate EFO mapping
field diseaseFromSourceMappedId. In case there are multiple matches, the evidence strings will be exploded
accordingly.

Currently, both source columns (diseaseFromSource and diseaseFromSourceId) need to be present in the original
schema, although they do not have to be populated for all rows."""
logging.info('Collect all distinct (disease name, disease ID) pairs.')
disease_info_to_map = evidence_strings.select('diseaseFromSource', 'diseaseFromSourceId').distinct().toPandas()
logger.info("Collect all distinct (disease name, disease ID) pairs.")
disease_info_to_map: pd.DataFrame = (
evidence_strings.select("diseaseFromSource", "diseaseFromSourceId")
.distinct()
.toPandas()
)

# If no EFO version is specified:
if not efo_version:
@@ -57,38 +79,50 @@ def add_efo_mapping(evidence_strings, spark_instance, ontoma_cache_dir=None, efo
efo_version = os.environ["EFO_VERSION"]
# Set default version to latest.
else:
logging.warning('No EFO version specified. Using latest version.')
efo_version = 'latest'
logger.warning("No EFO version specified. Using latest version.")
efo_version = "latest"

logging.info(f'Initialise OnToma instance. Using EFO version {efo_version}')
logger.info(f"Initialise OnToma instance. Using EFO version {efo_version}")
ontoma_instance = OnToma(cache_dir=ontoma_cache_dir, efo_release=efo_version)

logging.info('Map disease information to EFO.')
disease_info_to_map['diseaseFromSourceMappedId'] = disease_info_to_map.parallel_apply(
_ontoma_udf, args=(ontoma_instance,), axis=1
logger.info("Map disease information to EFO.")
disease_info_to_map = disease_info_to_map.assign(
diseaseFromSourceMappedId=lambda df: df.parallel_apply(
_ontoma_udf, args=(ontoma_instance,), axis=1
)
)
disease_info_to_map = (
disease_info_to_map.explode('diseaseFromSourceMappedId')
disease_info_to_map.explode("diseaseFromSourceMappedId")
# Cast all null values to python None to avoid errors in Spark's DF
.fillna(nan).replace([nan], [None])
.fillna(nan)
.replace([nan], [None])
)
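The explode/fillna/replace chain turns one row per (name, ID) pair carrying a list of mappings into one row per mapping, and converts pandas NaN into Python None so Spark receives proper nulls. A small standalone illustration with toy data (not from the PR):

```python
import pandas as pd
from numpy import nan

toy = pd.DataFrame(
    {
        "diseaseFromSource": ["disease A", "disease B"],
        "diseaseFromSourceId": ["MONDO:0000001", None],
        "diseaseFromSourceMappedId": [["EFO_1", "EFO_2"], []],
    }
)
exploded = (
    toy.explode("diseaseFromSourceMappedId")
    # Empty lists explode to NaN; normalise None/NaN and cast back to None.
    .fillna(nan)
    .replace([nan], [None])
)
print(exploded)  # disease A yields two rows (EFO_1, EFO_2); disease B yields one row with None
```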

logging.info('Join the resulting information into the evidence strings.')
logger.info("Join the resulting information into the evidence strings.")
schema = StructType(
[
StructField("diseaseFromSource_right", StringType(), True),
StructField("diseaseFromSourceId_right", StringType(), True),
StructField("diseaseFromSourceMappedId", StringType(), True),
]
)
disease_info_df = spark_instance.createDataFrame(disease_info_to_map, schema=schema).withColumn(
'diseaseFromSourceMappedId', when(col('diseaseFromSourceMappedId') != 'nan', col('diseaseFromSourceMappedId'))
disease_info_df = spark_instance.createDataFrame(
disease_info_to_map, schema=schema
).withColumn(
"diseaseFromSourceMappedId",
when(
col("diseaseFromSourceMappedId") != "nan", col("diseaseFromSourceMappedId")
),
)
# WARNING: Spark's join operator is not null safe by default and, most of the time, `diseaseFromSourceId` will be null.
# `eqNullSafe` is a special null safe equality operator that is used to join the two dataframes.
join_cond = (evidence_strings.diseaseFromSource == disease_info_df.diseaseFromSource_right) & (
evidence_strings.diseaseFromSourceId.eqNullSafe(disease_info_df.diseaseFromSourceId_right)
join_cond = (
evidence_strings.diseaseFromSource == disease_info_df.diseaseFromSource_right
) & (
evidence_strings.diseaseFromSourceId.eqNullSafe(
disease_info_df.diseaseFromSourceId_right
)
)
return evidence_strings.join(disease_info_df, on=join_cond, how='left').drop(
'diseaseFromSource_right', 'diseaseFromSourceId_right'
return evidence_strings.join(disease_info_df, on=join_cond, how="left").drop(
"diseaseFromSource_right", "diseaseFromSourceId_right"
)
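The null-safe join matters because in Spark a plain `==` between two nulls evaluates to null (treated as not matching), so evidence rows without a diseaseFromSourceId would never pick up their mapping. A toy demonstration of the difference (standalone, not part of the PR):

```python
# eqNullSafe vs == when both sides are null (toy example).
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()
left = spark.createDataFrame([("disease A", None)], "name string, source_id string")
right = spark.createDataFrame(
    [("disease A", None, "EFO_1")], "name_r string, source_id_r string, efo string"
)

plain = left.join(right, left.source_id == right.source_id_r, "left")
null_safe = left.join(right, left.source_id.eqNullSafe(right.source_id_r), "left")

plain.select("efo").show()      # null: null == null does not match
null_safe.select("efo").show()  # EFO_1: the null-safe comparison matches
```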