
Commit

Merge branch 'issues/CSD-301-custom-types' into develop
diegoabt committed Dec 4, 2024
2 parents 574522d + 509eecf commit 045a76c
Showing 28 changed files with 374 additions and 299 deletions.
88 changes: 48 additions & 40 deletions probinet/evaluation/community_detection.py
@@ -3,10 +3,41 @@
 """
 
 from contextlib import suppress
+from typing import Set
 
 import numpy as np
 
 
+def calculate_metric(ground_truth: Set[int], detected: Set[int], metric: str) -> float:
+    """
+    Calculate a metric for evaluating community detection.
+
+    Parameters
+    ----------
+    ground_truth : Set[int]
+        The set of ground truth nodes.
+    detected : Set[int]
+        The set of detected nodes.
+    metric : str
+        The metric to use for evaluation ('f1' or 'jaccard').
+
+    Returns
+    -------
+    float
+        The calculated metric value.
+    """
+    if not len(ground_truth.intersection(detected)):
+        return 0.0
+    precision = len(ground_truth.intersection(detected)) / len(detected)
+    recall = len(ground_truth.intersection(detected)) / len(ground_truth)
+    if metric == "f1":
+        return 2 * (precision * recall) / (precision + recall)
+    elif metric == "jaccard":
+        return len(ground_truth.intersection(detected)) / len(
+            ground_truth.union(detected)
+        )
+
+
 def compute_community_detection_metric(
     U_infer: np.ndarray, U0: np.ndarray, metric: str = "f1", com: bool = False
 ) -> float:
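
A quick, editorial usage sketch of the new helper (not part of the commit); the numbers only illustrate the two supported metrics, and the import path follows the file location shown above:

from probinet.evaluation.community_detection import calculate_metric

ground_truth = {0, 1, 2, 3}
detected = {2, 3, 4}

# Precision = 2/3, recall = 2/4, so F1 = 2 * (2/3 * 1/2) / (2/3 + 1/2) ≈ 0.5714.
print(calculate_metric(ground_truth, detected, "f1"))
# Jaccard = |{2, 3}| / |{0, 1, 2, 3, 4}| = 0.4.
print(calculate_metric(ground_truth, detected, "jaccard"))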
@@ -15,52 +46,29 @@ def compute_community_detection_metric(
     """
     if metric not in {"f1", "jaccard"}:
         raise ValueError('The similarity measure can be either "f1" or "jaccard"!')
+
     K = U0.shape[1]
-    gt = {}
+    threshold = 1 / K
+    # Create the ground truth dictionary for each community in the original partition. The key is
+    # the community index and the value is the set of nodes in that community, i.e., the nodes
+    # with a value greater than the threshold in the corresponding column of the original partition.
+    gt = {i: set(np.argwhere(U0[:, i] > threshold).flatten()) for i in range(K)}
     d = {}
-    threshold = 1 / U0.shape[1]
     for i in range(K):
-        gt[i] = list(np.argwhere(U0[:, i] > threshold).flatten())
         if com:
             with suppress(IndexError):
-                d[i] = U_infer[i]
+                d[i] = set(U_infer[i])
         else:
-            d[i] = list(np.argwhere(U_infer[:, i] > threshold).flatten())
-    R = 0
-    for i in np.arange(K):
-        ground_truth = set(gt[i])
-        _max = -1
-        M = 0
-        for j in d.keys():
-            detected = set(d[j])
-            if len(ground_truth & detected) != 0:
-                precision = len(ground_truth & detected) / len(detected)
-                recall = len(ground_truth & detected) / len(ground_truth)
-                if metric == "f1":
-                    M = 2 * (precision * recall) / (precision + recall)
-                elif metric == "jaccard":
-                    M = len(ground_truth & detected) / len(ground_truth.union(detected))
-            if M > _max:
-                _max = M
-        R += _max
-    S = 0
-    for j in d.keys():
-        detected = set(d[j])
-        _max = -1
-        M = 0
-        for i in np.arange(K):
-            ground_truth = set(gt[i])
-            if len(ground_truth & detected) != 0:
-                precision = len(ground_truth & detected) / len(detected)
-                recall = len(ground_truth & detected) / len(ground_truth)
-                if metric == "f1":
-                    M = 2 * (precision * recall) / (precision + recall)
-                elif metric == "jaccard":
-                    M = len(ground_truth & detected) / len(ground_truth.union(detected))
-            if M > _max:
-                _max = M
-        S += _max
-    return np.round(R / (2 * len(gt)) + S / (2 * len(d)), 4)
+            d[i] = set(np.argwhere(U_infer[:, i] > threshold).flatten())
+
+    R = sum(
+        max(calculate_metric(gt[i], d[j], metric) for j in d.keys()) for i in range(K)
+    )
+    S = sum(
+        max(calculate_metric(gt[i], d[j], metric) for i in range(K)) for j in d.keys()
+    )
+    # Return the average of the two measures
+    return np.round(R / (2 * K) + S / (2 * len(d)), 4)
 
 
 def compute_permutation_matrix(U_infer: np.ndarray, U0: np.ndarray) -> np.ndarray:
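
For orientation, a hedged end-to-end sketch of the refactored scorer on toy membership matrices (shapes and values are invented for illustration and do not come from the repository's tests):

import numpy as np

from probinet.evaluation.community_detection import compute_community_detection_metric

# Two hard partitions of 4 nodes into K = 2 communities, in one-hot form.
U0 = np.array([[1, 0], [1, 0], [0, 1], [0, 1]], dtype=float)       # ground-truth memberships
U_infer = np.array([[1, 0], [1, 0], [1, 0], [0, 1]], dtype=float)  # one node misassigned

# With com=False, both matrices are thresholded at 1/K to build a node set per community,
# and the best-matching scores are averaged in both directions.
print(compute_community_detection_metric(U_infer, U0, metric="f1"))  # ≈ 0.7333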
6 changes: 3 additions & 3 deletions probinet/evaluation/expectation_computation.py
@@ -2,15 +2,15 @@
 Functions for computing expectations and related metrics.
 """
 
-from typing import Optional, Tuple, Union
+from typing import Optional, Tuple
 
 import numpy as np
 import pandas as pd
 from scipy.stats import poisson
 from sklearn import metrics
-from sparse import COO
 
 from ..model_selection.labeling import extract_true_label, predict_label
+from ..types import GraphDataType
 from ..utils.matrix_operations import transpose_matrix, transpose_tensor
 from ..utils.tools import check_symmetric
 
@@ -32,7 +32,7 @@ def calculate_conditional_expectation(
 
 
 def calculate_conditional_expectation_dyncrep(
-    B_to_T: Union[COO, np.ndarray],
+    B_to_T: GraphDataType,
     u: np.ndarray,
     v: np.ndarray,
     w: np.ndarray,
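
The new annotation presumably aliases the union that was previously spelled out inline; a minimal sketch of what probinet/types.py might declare (that module is not part of this diff, so treat this as an assumption):

from typing import Union

import numpy as np
from sparse import COO

# Hypothetical reconstruction: one alias for "dense or sparse graph data",
# so signatures such as calculate_conditional_expectation_dyncrep stay short.
GraphDataType = Union[COO, np.ndarray]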
3 changes: 1 addition & 2 deletions probinet/evaluation/likelihood.py
@@ -198,8 +198,7 @@ def likelihood_conditional(
     # Check for NaN values in the log-likelihood
     if np.isnan(l):
         log_and_raise_error(ValueError, "Likelihood is NaN!")
-    else:
-        return l
+    return l
 
 
 def PSloglikelihood(
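
Dropping the else is safe only if log_and_raise_error actually raises; a hedged sketch of that helper's assumed contract (its real definition lives elsewhere in the package and is not shown in this diff):

import logging

def log_and_raise_error(error_type: type, message: str) -> None:
    # Assumed behaviour: log the message, then raise, so callers never fall through.
    logging.error(message)
    raise error_type(message)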
20 changes: 8 additions & 12 deletions probinet/evaluation/link_prediction.py
@@ -12,23 +12,18 @@
     compute_expected_adjacency_tensor_multilayer,
 )
 
 
 @singledispatch
-def mask_or_flatten_array(mask: Union[np.ndarray, None], expected_adjacency: np.ndarray) -> np.ndarray:
+def mask_or_flatten_array(
+    mask: Union[np.ndarray, None], expected_adjacency: np.ndarray
+) -> np.ndarray:
     raise NotImplementedError(f"Unsupported type {type(mask)} for mask.")
 
 
-@mask_or_flatten_array.register(np.ndarray)
-def _(mask: np.ndarray, expected_adjacency: np.ndarray) -> np.ndarray:
-    return expected_adjacency[mask > 0]
-
-
-@singledispatch
-def mask_or_flatten_array(mask: None, expected_adjacency: np.ndarray) -> np.ndarray:
-    return expected_adjacency.flatten()
+@mask_or_flatten_array.register(type(None))
+def _(mask: None, expected_adjacency: np.ndarray) -> np.ndarray:
+    return expected_adjacency.flatten()
 
 
 @mask_or_flatten_array.register(np.ndarray)
 def _(mask: np.ndarray, expected_adjacency: np.ndarray) -> np.ndarray:
@@ -179,5 +174,6 @@ def calculate_f1_score(
     # Binarize the data
     data = (data0 > 0).astype("int")
 
-    return metrics.f1_score(mask_or_flatten_array(mask, data), mask_or_flatten_array(mask, Z_pred))
-
+    return metrics.f1_score(
+        mask_or_flatten_array(mask, data), mask_or_flatten_array(mask, Z_pred)
+    )
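
A short usage sketch of the consolidated singledispatch helper (values are illustrative; the ndarray overload is assumed to keep returning the masked entries, as in the removed code):

import numpy as np

from probinet.evaluation.link_prediction import mask_or_flatten_array

expected = np.array([[0.1, 0.9], [0.4, 0.6]])
mask = np.array([[1, 0], [0, 1]])

print(mask_or_flatten_array(None, expected))  # None overload: flatten -> [0.1 0.9 0.4 0.6]
print(mask_or_flatten_array(mask, expected))  # ndarray overload: masked entries -> [0.1 0.6]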
64 changes: 27 additions & 37 deletions probinet/input/loader.py
@@ -11,7 +11,6 @@
 import networkx as nx
 import numpy as np
 import pandas as pd
-from numpy import ndarray
 
 from .preprocessing import (
     create_adjacency_tensor_from_graph_list,
@@ -34,44 +33,35 @@ def build_adjacency_from_file(
     **_kwargs: Any,
 ) -> GraphData:
     """
     Import data, i.e. the adjacency matrix, from a given folder.
 
     Return the NetworkX graph and its numpy adjacency matrix.
 
     Parameters
     ----------
     path_to_file
         Path of the input file.
     ego
         Name of the column to consider as the source of the edge.
     alter
         Name of the column to consider as the target of the edge.
     force_dense
         If set to True, the algorithm is forced to consider a dense adjacency tensor.
     undirected
         If set to True, the algorithm considers an undirected graph.
     noselfloop
         If set to True, the algorithm removes the self-loops.
     sep
         Separator to use when reading the dataset.
     binary
         If set to True, the algorithm reads the graph with binary edges.
     header
         Row number to use as the column names, and the start of the data.
 
     Returns
     -------
-    A
-        List of MultiDiGraph NetworkX objects representing the layers of the network.
-    B
-        Graph adjacency tensor. If `force_dense` is True, returns a dense ndarray. Otherwise, returns a sparse COO tensor.
-    B_T
-        Transposed graph adjacency tensor. Returns None if `force_dense` is True.
-    data_T_vals
-        Array with values of entries A[j, i] if entry A[i, j] is non-zero. Returns None if
-        `force_dense` is True.
-    nodes
-        List of node IDs
+    GraphData
+        Named tuple containing the graph list, the adjacency tensor, the transposed tensor, the data values, and the nodes.
     """
 
     # Read adjacency file
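
A hedged call sketch for the loader (the file name, column labels, and GraphData attribute names other than adjacency_tensor are invented for illustration; the parameter names come from the docstring above):

from probinet.input.loader import build_adjacency_from_file

gdata = build_adjacency_from_file(
    path_to_file="example_network.csv",  # hypothetical edge list, one row per edge
    ego="source",
    alter="target",
    force_dense=True,
    undirected=False,
    noselfloop=True,
    binary=True,
    sep=",",
    header=0,
)
# GraphData is a named tuple; the adjacency tensor is accessed as gdata.adjacency_tensor
# (that attribute name also appears in cross_validation.py below).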
3 changes: 2 additions & 1 deletion probinet/input/preprocessing.py
@@ -12,6 +12,7 @@
 import scipy
 from sparse import COO
 
+from ..types import GraphDataType
 from ..utils import tools
 
 
@@ -162,7 +163,7 @@ def create_sparse_adjacency_tensor_from_graph_list(
     return data
 
 
-def preprocess_adjacency_tensor(A: np.ndarray) -> Union[COO, np.ndarray]:
+def preprocess_adjacency_tensor(A: np.ndarray) -> GraphDataType:
     """
     Pre-process input data tensor.
8 changes: 8 additions & 0 deletions probinet/main.py
@@ -29,6 +29,14 @@
 
 
 def parse_args():
+    """
+    Parse the command-line arguments.
+
+    Returns
+    -------
+    args : argparse.Namespace
+        Parsed arguments.
+    """
     parser = argparse.ArgumentParser(
         description="Script to run the CRep, JointCRep, DynCRep, MTCOV, and ACD algorithms.",
         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
25 changes: 24 additions & 1 deletion probinet/model_selection/cross_validation.py
@@ -32,6 +32,10 @@
 
 
 class CrossValidation(ABC):
+    """
+    Abstract class to implement cross-validation for a given algorithm.
+    """
+
     def __init__(
         self, algorithm, model_parameters, cv_parameters, numerical_parameters=None
     ):
@@ -44,6 +48,9 @@ def __init__(
             setattr(self, key, value)
 
     def prepare_output_directory(self):
+        """
+        Prepare the output directory to save the results.
+        """
         if not os.path.exists(self.out_folder):
             os.makedirs(self.out_folder)
 
@@ -101,7 +108,23 @@ def load_data(self):
             sep=self.sep,
         )
 
-    def prepare_and_run(self, mask):
+    def prepare_and_run(self, mask: np.ndarray):
+        """
+        Prepare the data for training and run the algorithm.
+
+        Parameters
+        ----------
+        mask: np.ndarray
+            The mask to apply on the data.
+
+        Returns
+        -------
+        tuple
+            The outputs of the algorithm.
+        object
+            The algorithm object.
+        """
+        # Create a copy of the adjacency matrix B to use for training
         B_train = self.gdata.adjacency_tensor.copy()
 
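
A hedged sketch of the pattern prepare_and_run's docstring describes; everything past the B_train copy is an assumption about typical hold-out masking, since the rest of the method body is not shown in this diff:

import numpy as np

def apply_holdout_mask(adjacency_tensor: np.ndarray, mask: np.ndarray) -> np.ndarray:
    # Keep a training copy and zero out the held-out entries selected by the mask.
    B_train = adjacency_tensor.copy()
    B_train[mask > 0] = 0
    return B_train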
27 changes: 24 additions & 3 deletions probinet/model_selection/main.py
@@ -3,6 +3,7 @@
 """
 
 import logging
+from typing import Any, Optional
 
 import pandas as pd
 
@@ -15,9 +16,29 @@
 
 
 def cross_validation(
-    algorithm, model_parameters, cv_parameters, numerical_parameters=None
-):
-
+    algorithm: str,
+    model_parameters: dict[str, Any],
+    cv_parameters: dict[str, Any],
+    numerical_parameters: Optional[dict[str, Any]] = None,
+) -> pd.DataFrame:
+    """
+    Run cross-validation for a given algorithm.
+
+    Parameters
+    ----------
+    algorithm
+        String with the name of the algorithm to run.
+    model_parameters
+        Dictionary with the parameters for the algorithm.
+    cv_parameters
+        Dictionary with the parameters for the cross-validation.
+    numerical_parameters
+        Dictionary with the numerical parameters for the algorithm, like the number of iterations, etc.
+
+    Returns
+    -------
+    results_df
+        DataFrame with the results of the cross-validation.
+    """
     if numerical_parameters is None:
         numerical_parameters = {}
     cv_classes = {
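
A hedged usage sketch of the annotated entry point (dictionary contents are invented; only the argument names, the return type, and the algorithm names in main.py's description are taken from this diff):

from probinet.model_selection.main import cross_validation

results_df = cross_validation(
    algorithm="CRep",                     # one of the algorithms listed in main.py
    model_parameters={"K": 3},            # hypothetical model settings
    cv_parameters={"out_folder": "cv/"},  # hypothetical cross-validation settings
)
print(results_df.head())                  # the annotation promises a pandas DataFrame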
