diff --git a/probinet/evaluation/community_detection.py b/probinet/evaluation/community_detection.py
index 98cb69b..3c15b1c 100644
--- a/probinet/evaluation/community_detection.py
+++ b/probinet/evaluation/community_detection.py
@@ -3,10 +3,41 @@
 """
 
 from contextlib import suppress
+from typing import Set
 
 import numpy as np
 
 
+def calculate_metric(ground_truth: Set[int], detected: Set[int], metric: str) -> float:
+    """
+    Calculate a metric for evaluating community detection.
+
+    Parameters
+    ----------
+    ground_truth : Set[int]
+        The set of ground truth nodes.
+    detected : Set[int]
+        The set of detected nodes.
+    metric : str
+        The metric to use for evaluation ('f1' or 'jaccard').
+
+    Returns
+    -------
+    float
+        The calculated metric value.
+    """
+    overlap = len(ground_truth & detected)
+    if not overlap:
+        return 0.0
+    precision = overlap / len(detected)
+    recall = overlap / len(ground_truth)
+    if metric == "f1":
+        return 2 * (precision * recall) / (precision + recall)
+    if metric == "jaccard":
+        return overlap / len(ground_truth | detected)
+    raise ValueError('The similarity measure can be either "f1" or "jaccard"!')
+
+
 def compute_community_detection_metric(
     U_infer: np.ndarray, U0: np.ndarray, metric: str = "f1", com: bool = False
 ) -> float:
@@ -15,52 +46,29 @@ def compute_community_detection_metric(
     """
     if metric not in {"f1", "jaccard"}:
         raise ValueError('The similarity measure can be either "f1" or "jaccard"!')
+    K = U0.shape[1]
 
-    gt = {}
+    threshold = 1 / K
+    # Build the ground-truth dictionary: for each community in the original partition, map the
+    # community index to the set of nodes whose membership value in the corresponding column
+    # of U0 exceeds the threshold.
+ gt = {i: set(np.argwhere(U0[:, i] > threshold).flatten()) for i in range(K)} d = {} - threshold = 1 / U0.shape[1] for i in range(K): - gt[i] = list(np.argwhere(U0[:, i] > threshold).flatten()) if com: with suppress(IndexError): - d[i] = U_infer[i] + d[i] = set(U_infer[i]) else: - d[i] = list(np.argwhere(U_infer[:, i] > threshold).flatten()) - R = 0 - for i in np.arange(K): - ground_truth = set(gt[i]) - _max = -1 - M = 0 - for j in d.keys(): - detected = set(d[j]) - if len(ground_truth & detected) != 0: - precision = len(ground_truth & detected) / len(detected) - recall = len(ground_truth & detected) / len(ground_truth) - if metric == "f1": - M = 2 * (precision * recall) / (precision + recall) - elif metric == "jaccard": - M = len(ground_truth & detected) / len(ground_truth.union(detected)) - if M > _max: - _max = M - R += _max - S = 0 - for j in d.keys(): - detected = set(d[j]) - _max = -1 - M = 0 - for i in np.arange(K): - ground_truth = set(gt[i]) - if len(ground_truth & detected) != 0: - precision = len(ground_truth & detected) / len(detected) - recall = len(ground_truth & detected) / len(ground_truth) - if metric == "f1": - M = 2 * (precision * recall) / (precision + recall) - elif metric == "jaccard": - M = len(ground_truth & detected) / len(ground_truth.union(detected)) - if M > _max: - _max = M - S += _max - return np.round(R / (2 * len(gt)) + S / (2 * len(d)), 4) + d[i] = set(np.argwhere(U_infer[:, i] > threshold).flatten()) + + R = sum( + max(calculate_metric(gt[i], d[j], metric) for j in d.keys()) for i in range(K) + ) + S = sum( + max(calculate_metric(gt[i], d[j], metric) for i in range(K)) for j in d.keys() + ) + # Return the average of the two measures + return np.round(R / (2 * K) + S / (2 * len(d)), 4) def compute_permutation_matrix(U_infer: np.ndarray, U0: np.ndarray) -> np.ndarray: diff --git a/probinet/evaluation/expectation_computation.py b/probinet/evaluation/expectation_computation.py index 55ca358..8d84a09 100644 --- a/probinet/evaluation/expectation_computation.py +++ b/probinet/evaluation/expectation_computation.py @@ -2,15 +2,15 @@ Functions for computing expectations and related metrics. 
""" -from typing import Optional, Tuple, Union +from typing import Optional, Tuple import numpy as np import pandas as pd from scipy.stats import poisson from sklearn import metrics -from sparse import COO from ..model_selection.labeling import extract_true_label, predict_label +from ..types import GraphDataType from ..utils.matrix_operations import transpose_matrix, transpose_tensor from ..utils.tools import check_symmetric @@ -32,7 +32,7 @@ def calculate_conditional_expectation( def calculate_conditional_expectation_dyncrep( - B_to_T: Union[COO, np.ndarray], + B_to_T: GraphDataType, u: np.ndarray, v: np.ndarray, w: np.ndarray, diff --git a/probinet/evaluation/likelihood.py b/probinet/evaluation/likelihood.py index f61b472..88c1d3b 100644 --- a/probinet/evaluation/likelihood.py +++ b/probinet/evaluation/likelihood.py @@ -198,8 +198,7 @@ def likelihood_conditional( # Check for NaN values in the log-likelihood if np.isnan(l): log_and_raise_error(ValueError, "Likelihood is NaN!") - else: - return l + return l def PSloglikelihood( diff --git a/probinet/evaluation/link_prediction.py b/probinet/evaluation/link_prediction.py index d0fd4d7..7efc33c 100644 --- a/probinet/evaluation/link_prediction.py +++ b/probinet/evaluation/link_prediction.py @@ -12,23 +12,18 @@ compute_expected_adjacency_tensor_multilayer, ) + @singledispatch -def mask_or_flatten_array(mask: Union[np.ndarray, None], expected_adjacency: np.ndarray) -> np.ndarray: +def mask_or_flatten_array( + mask: Union[np.ndarray, None], expected_adjacency: np.ndarray +) -> np.ndarray: raise NotImplementedError(f"Unsupported type {type(mask)} for mask.") + @mask_or_flatten_array.register(type(None)) def _(mask: None, expected_adjacency: np.ndarray) -> np.ndarray: return expected_adjacency.flatten() -@mask_or_flatten_array.register(np.ndarray) -def _(mask: np.ndarray, expected_adjacency: np.ndarray) -> np.ndarray: - return expected_adjacency[mask > 0] - - -@singledispatch -def mask_or_flatten_array(mask: None, expected_adjacency: np.ndarray) -> np.ndarray: - return expected_adjacency.flatten() - @mask_or_flatten_array.register(np.ndarray) def _(mask: np.ndarray, expected_adjacency: np.ndarray) -> np.ndarray: @@ -179,5 +174,6 @@ def calculate_f1_score( # Binarize the data data = (data0 > 0).astype("int") - return metrics.f1_score(mask_or_flatten_array(mask, data), mask_or_flatten_array(mask, Z_pred)) - + return metrics.f1_score( + mask_or_flatten_array(mask, data), mask_or_flatten_array(mask, Z_pred) + ) diff --git a/probinet/input/loader.py b/probinet/input/loader.py index e654710..8ca21fb 100644 --- a/probinet/input/loader.py +++ b/probinet/input/loader.py @@ -11,7 +11,6 @@ import networkx as nx import numpy as np import pandas as pd -from numpy import ndarray from .preprocessing import ( create_adjacency_tensor_from_graph_list, @@ -34,44 +33,35 @@ def build_adjacency_from_file( **_kwargs: Any, ) -> GraphData: """ - Import data, i.e. the adjacency matrix, from a given folder. + Import data, i.e. the adjacency matrix, from a given folder. - Return the NetworkX graph and its numpy adjacency matrix. + Return the NetworkX graph and its numpy adjacency matrix. - Parameters - ---------- - path_to_file - Path of the input file. - ego - Name of the column to consider as the source of the edge. - alter - Name of the column to consider as the target of the edge. - force_dense - If set to True, the algorithm is forced to consider a dense adjacency tensor. - undirected - If set to True, the algorithm considers an undirected graph. 
- noselfloop - If set to True, the algorithm removes the self-loops. - sep - Separator to use when reading the dataset. - binary - If set to True, the algorithm reads the graph with binary edges. - header - Row number to use as the column names, and the start of the data. - - Returns - ------- - A - List of MultiDiGraph NetworkX objects representing the layers of the network. - B - Graph adjacency tensor. If `force_dense` is True, returns a dense ndarray. Otherwise, returns a sparse COO tensor. - B_T - Transposed graph adjacency tensor. Returns None if `force_dense` is True. - data_T_vals - Array with values of entries A[j, i] if entry A[i, j] is non-zero. Returns None if - `force_dense` is True. - nodes - List of node IDs + Parameters + ---------- + path_to_file + Path of the input file. + ego + Name of the column to consider as the source of the edge. + alter + Name of the column to consider as the target of the edge. + force_dense + If set to True, the algorithm is forced to consider a dense adjacency tensor. + undirected + If set to True, the algorithm considers an undirected graph. + noselfloop + If set to True, the algorithm removes the self-loops. + sep + Separator to use when reading the dataset. + binary + If set to True, the algorithm reads the graph with binary edges. + header + Row number to use as the column names, and the start of the data. + + Returns + ------- + GraphData + Named tuple containing the graph list, the adjacency tensor, the transposed tensor, the data values, and the nodes. """ # Read adjacency file diff --git a/probinet/input/preprocessing.py b/probinet/input/preprocessing.py index ecc2f52..3700ce3 100644 --- a/probinet/input/preprocessing.py +++ b/probinet/input/preprocessing.py @@ -12,6 +12,7 @@ import scipy from sparse import COO +from ..types import GraphDataType from ..utils import tools @@ -162,7 +163,7 @@ def create_sparse_adjacency_tensor_from_graph_list( return data -def preprocess_adjacency_tensor(A: np.ndarray) -> Union[COO, np.ndarray]: +def preprocess_adjacency_tensor(A: np.ndarray) -> GraphDataType: """ Pre-process input data tensor. diff --git a/probinet/main.py b/probinet/main.py index 1af1f9a..1315506 100644 --- a/probinet/main.py +++ b/probinet/main.py @@ -29,6 +29,14 @@ def parse_args(): + """ + Parse the command-line arguments. + + Returns + ------- + args : argparse.Namespace + Parsed arguments. + """ parser = argparse.ArgumentParser( description="Script to run the CRep, JointCRep, DynCRep, MTCOV, and ACD algorithms.", formatter_class=argparse.ArgumentDefaultsHelpFormatter, diff --git a/probinet/model_selection/cross_validation.py b/probinet/model_selection/cross_validation.py index 8128bfe..d3176ca 100644 --- a/probinet/model_selection/cross_validation.py +++ b/probinet/model_selection/cross_validation.py @@ -32,6 +32,10 @@ class CrossValidation(ABC): + """ + Abstract class to implement cross-validation for a given algorithm. + """ + def __init__( self, algorithm, model_parameters, cv_parameters, numerical_parameters=None ): @@ -44,6 +48,9 @@ def __init__( setattr(self, key, value) def prepare_output_directory(self): + """ + Prepare the output directory to save the results. + """ if not os.path.exists(self.out_folder): os.makedirs(self.out_folder) @@ -101,7 +108,23 @@ def load_data(self): sep=self.sep, ) - def prepare_and_run(self, mask): + def prepare_and_run(self, mask: np.ndarray): + """ + Prepare the data for training and run the algorithm. + + Parameters + ---------- + mask: np.ndarray + The mask to apply on the data. 
+
+        Returns
+        -------
+        tuple
+            The outputs of the algorithm.
+        object
+            The algorithm object.
+
+        """
         # Create a copy of the adjacency matrix B to use for training
         B_train = self.gdata.adjacency_tensor.copy()
 
diff --git a/probinet/model_selection/main.py b/probinet/model_selection/main.py
index 0ae7027..62bcffd 100644
--- a/probinet/model_selection/main.py
+++ b/probinet/model_selection/main.py
@@ -3,6 +3,7 @@
 """
 
 import logging
+from typing import Any, Optional
 
 import pandas as pd
 
@@ -15,9 +16,30 @@
 
 
 def cross_validation(
-    algorithm, model_parameters, cv_parameters, numerical_parameters=None
-):
-
+    algorithm: str,
+    model_parameters: dict[str, Any],
+    cv_parameters: dict[str, Any],
+    numerical_parameters: Optional[dict[str, Any]] = None,
+) -> pd.DataFrame:
+    """
+    Run cross-validation for a given algorithm.
+
+    Parameters
+    ----------
+    algorithm
+        String with the name of the algorithm to run.
+    model_parameters
+        Dictionary with the parameters for the algorithm.
+    cv_parameters
+        Dictionary with the parameters for the cross-validation.
+    numerical_parameters
+        Dictionary with the numerical parameters for the algorithm, like the number of iterations, etc.
+
+    Returns
+    -------
+    results_df
+        DataFrame with the results of the cross-validation.
+    """
     if numerical_parameters is None:
         numerical_parameters = {}
     cv_classes = {
diff --git a/probinet/model_selection/masking.py b/probinet/model_selection/masking.py
index 086c20f..90cd62e 100644
--- a/probinet/model_selection/masking.py
+++ b/probinet/model_selection/masking.py
@@ -2,7 +2,7 @@
 This module provides functions for shuffling indices and extracting masks for selecting
 the held-out set in the adjacency tensor and design matrix.
 """
-from typing import List, Optional, Tuple
+from typing import List, Tuple, Sequence
 
 import numpy as np
 
@@ -67,8 +67,8 @@ def shuffle_indicesG(N: int, L: int, rseed: int = 10) -> List[List[Tuple[int, in
     idxG = [[(i, j) for i in range(N) for j in range(N)] for _ in range(L)]
 
     # Shuffle indices for each layer
-    for l in range(L):
-        rng.shuffle(idxG[l])
+    for layer in range(L):
+        rng.shuffle(idxG[layer])
 
     return idxG
 
@@ -102,8 +102,8 @@ def shuffle_indicesX(N: int, rseed: int = 10) -> np.ndarray:
 def extract_masks(
     N: int,
     L: int,
-    idxG: Optional[List[List[Tuple[int, int]]]] = None,
-    idxX: Optional[List[int]] = None,
+    idxG: list[list[Tuple[int, int]]],
+    idxX: Sequence[int],
     cv_type: str = "kfold",
     NFold: int = 5,
     fold: int = 0,
diff --git a/probinet/model_selection/mtcov_cross_validation.py b/probinet/model_selection/mtcov_cross_validation.py
index 2a1f47d..f7bd25c 100644
--- a/probinet/model_selection/mtcov_cross_validation.py
+++ b/probinet/model_selection/mtcov_cross_validation.py
@@ -127,7 +127,7 @@ def prepare_and_run(self, masks):
         return outputs, algorithm_object
 
     def calculate_performance_and_prepare_comparison(
-        self, outputs, masks, fold, algorithm_object
+        self, outputs, masks, fold, _algorithm_object
     ):
         maskG, maskX = masks
 
diff --git a/probinet/models/acd.py b/probinet/models/acd.py
index e6b9c03..785569f 100644
--- a/probinet/models/acd.py
+++ b/probinet/models/acd.py
@@ -6,9 +6,8 @@
 import logging
 import time
-from os import PathLike
 from pathlib import Path
-from typing import Any, List, Optional, Tuple, Union
+from typing import Any, List, Tuple, Optional
 
 import numpy as np
 import pandas as pd
@@ -21,9 +20,16 @@
 )
 from .base import ModelBase, ModelUpdateMixin
 from .classes import GraphData
-from .constants import EPS_
+from .constants import EPS_, OUTPUT_FOLDER, BG_DEFAULT, K_DEFAULT, AG_DEFAULT
from ..input.preprocessing import preprocess_adjacency_tensor -from ..types import GraphDataType +from ..types import ( + GraphDataType, + MaskType, + EndFileType, + FilesType, + ArraySequence, + SubsNzType, +) from ..utils.matrix_operations import ( sp_uttkrp, sp_uttkrp_assortative, @@ -106,26 +112,26 @@ def _check_fit_params(self, **kwargs) -> None: def fit( self, gdata: GraphData, - ag: float = 1.5, - bg: float = 10.0, + ag: float = AG_DEFAULT, + bg: float = BG_DEFAULT, pibr0: Optional[float] = None, mupr0: Optional[float] = None, flag_anomaly: bool = True, fix_pibr: bool = False, fix_mupr: bool = False, - K: int = 3, + K: int = K_DEFAULT, undirected: bool = False, initialization: int = 0, assortative: bool = True, constrained: bool = False, fix_w: bool = False, fix_communities: bool = False, - mask: Optional[np.ndarray] = None, + mask: Optional[MaskType] = None, rseed: int = 10, out_inference: bool = True, - out_folder: Path = Path("outputs"), - end_file: Optional[str] = None, - files: Optional[PathLike] = None, + out_folder: Path = OUTPUT_FOLDER, + end_file: Optional[EndFileType] = None, + files: Optional[FilesType] = None, **__kwargs: Any, ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, float, float, float]: """ @@ -133,7 +139,7 @@ def fit( Parameters ---------- - data : Union[COO, np.ndarray] + data : GraphDataType Graph adjacency tensor. nodes : List[int] List of node IDs. @@ -141,9 +147,9 @@ def fit( Shape of gamma prior, by default 1.5. bg : float, optional Rate of gamma prior, by default 10.0. - pibr0 : Optional[float], optional + pibr0 : float Initial value for the anomaly parameter pi, by default None. - mupr0 : Optional[float], optional + mupr0 : float Initial value for the prior mu parameter, by default None. flag_anomaly : bool, optional If True, the anomaly detection is enabled, by default True. @@ -165,7 +171,7 @@ def fit( If True, the affinity tensor w is fixed, by default False. fix_communities : bool, optional If True, the community memberships are fixed, by default False. - mask : Optional[np.ndarray], optional + mask : MaskType, optional Mask for selecting the held-out set in the adjacency tensor in case of cross-validation, by default None. rseed : int, optional Random seed for initialization, by default 10. @@ -309,14 +315,12 @@ def _initialize_realization(self) -> Tuple[int, bool, int, float, List[float]]: # Return the initial state of the realization return coincide, convergence, it, loglik, loglik_values - def _preprocess_data_for_fit( - self, data: GraphDataType, mask: Optional[np.ndarray] - ) -> Tuple[ + def _preprocess_data_for_fit(self, data: GraphDataType, mask: MaskType) -> Tuple[ GraphDataType, np.ndarray, np.ndarray, - Tuple[int, int, int], - Optional[Tuple[int, int, int]], + tuple[int, int, int], + tuple[int, int, int], ]: logging.debug("Preprocessing the data for fitting the models.") logging.debug("Data looks like: %s", data) @@ -425,7 +429,7 @@ def _update_cache( Parameters ---------- - data : Union[COO, np.ndarray] + data : GraphDataType Graph adjacency tensor. data_T_vals : ndarray Array with values of entries A[j, i] given non-zero entry (i, j). @@ -460,7 +464,7 @@ def _QIJ( self, data: GraphDataType, data_T_vals: np.ndarray, - subs_nz: Tuple[int, int, int], + subs_nz: SubsNzType, ) -> Tuple[np.ndarray, np.ndarray]: """ Compute the mean lambda0_ij for only non-zero entries. 
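Several of the updated docstrings above describe `data_T_vals` as an "array with values of entries A[j, i] given non-zero entry (i, j)". As a quick orientation aid, here is a toy single-layer sketch of that quantity (the matrix and variable names are illustrative, not repo code):

```python
import numpy as np

# Toy sketch: for every non-zero entry A[i, j], data_T_vals holds the
# reciprocal value A[j, i], i.e. the weight of the reverse edge.
A = np.array([[0, 2, 0],
              [1, 0, 0],
              [0, 3, 0]])
subs_nz = A.nonzero()       # indices of non-zero entries: (0,1), (1,0), (2,1)
data_T_vals = A.T[subs_nz]  # A.T[i, j] == A[j, i]
print(data_T_vals)          # [1 2 0] -> A[1,0], A[0,1], A[1,2]
```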
@@ -586,16 +590,16 @@ def _update_em( def _update_pibr( self, data: GraphDataType, - subs_nz: Tuple[int, int, int], - mask: Optional[np.ndarray] = None, - subs_nz_mask: Optional[Tuple[int, int, int]] = None, + subs_nz: SubsNzType, + mask: Optional[MaskType] = None, + subs_nz_mask: Optional[SubsNzType] = None, ) -> float: """ Update the anomaly parameter pi. Parameters ---------- - data : Union[COO, np.ndarray] + data : GraphDataType Graph adjacency tensor. subs_nz : tuple Indices of elements of data that are non-zero. @@ -625,8 +629,8 @@ def _update_pibr( def _update_mupr( self, - mask: Optional[np.ndarray] = None, - subs_nz_mask: Optional[Tuple[int, int, int]] = None, + mask: Optional[MaskType] = None, + subs_nz_mask: Optional[SubsNzType] = None, ) -> float: """ Update the prior mu parameter. @@ -751,9 +755,7 @@ def _specific_update_V(self): row_sums = self.v.sum(axis=1) self.v[row_sums > 0] /= row_sums[row_sums > 0, np.newaxis] - def _specific_update_W( - self, subs_nz: Tuple[int, int, int], mask: Optional[np.ndarray] = None - ): + def _specific_update_W(self, subs_nz: SubsNzType, mask: MaskType = None): """ Update affinity tensor. @@ -791,7 +793,7 @@ def _specific_update_W( self.w[Z == 0] = 0.0 self.w[non_zeros] /= Z[non_zeros] - def _update_W(self, subs_nz: Tuple[int, int, int]) -> float: + def _update_W(self, subs_nz: SubsNzType) -> float: # a generic function here that will do what each class needs self._specific_update_W(subs_nz) @@ -849,7 +851,7 @@ def _specific_update_W_assortative(self): def _update_membership( self, - subs_nz: Tuple[np.ndarray], + subs_nz: ArraySequence, u: np.ndarray, v: np.ndarray, w: np.ndarray, @@ -894,15 +896,15 @@ def compute_likelihood(self): def _ELBO( self, data: GraphDataType, - mask: Optional[np.ndarray] = None, - subs_nz_mask: Optional[Tuple[int, int, int]] = None, + mask: Optional[MaskType] = None, + subs_nz_mask: Optional[SubsNzType] = None, ) -> float: """ Compute the Evidence Lower BOund (ELBO) of the data. Parameters ---------- - data : Union[COO, np.ndarray] + data : GraphDataType Graph adjacency tensor. mask : ndarray, optional Mask for selecting the held out set in the adjacency tensor in case of cross-validation. 
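The `_ELBO` hunks below mostly delete stray blank lines around the term-by-term comments. For orientation, the "Entropy of Bernoulli in Q" block they touch accumulates the entropy of the variational Bernoulli posterior `Qij_dense`. A minimal dense-array sketch of that quantity, assuming the module's `EPS_` guard against `log(0)` (an illustration, not the repo's masked implementation):

```python
import numpy as np

EPS_ = 1e-12  # same small constant the models use to avoid log(0)

def bernoulli_entropy(Q: np.ndarray) -> float:
    # H(Q) = -sum_ij [ Q_ij * log(Q_ij) + (1 - Q_ij) * log(1 - Q_ij) ]
    return -(Q * np.log(Q + EPS_) + (1 - Q) * np.log(1 - Q + EPS_)).sum()

Q = np.full((4, 4), 0.5)
print(bernoulli_entropy(Q))  # 16 * ln(2), roughly 11.09
```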
@@ -935,7 +937,6 @@ def _ELBO( l = 0.0 # Term containing Q, pi and A - l -= self.pibr * self.Qij_dense.sum() if self.pibr >= 0: @@ -953,7 +954,6 @@ def _ELBO( ) # Entropy of Bernoulli in Q - if mask is None: non_zeros = self.Qij_dense > 0 non_zeros1 = (1 - self.Qij_dense) > 0 @@ -970,7 +970,6 @@ def _ELBO( ).sum() # Term containing Q, M and A - if mask is None: l -= ((1 - self.Qij_dense) * self.lambda0_ija).sum() coords_tuple = tuple(data.coords[i] for i in range(3)) @@ -981,7 +980,6 @@ def _ELBO( ).sum() # Term containing Q and mu - if 1 - self.mupr >= 0: l += np.log(1 - self.mupr + EPS_) * (1 - self.Qij_dense).sum() if self.mupr >= 0: diff --git a/probinet/models/base.py b/probinet/models/base.py index 48af132..5768de1 100644 --- a/probinet/models/base.py +++ b/probinet/models/base.py @@ -10,7 +10,7 @@ from argparse import Namespace from functools import singledispatchmethod from pathlib import Path -from typing import Any, Dict, Optional, Tuple, Union +from typing import Any, Dict, Optional, Tuple import numpy as np from sparse import COO @@ -18,6 +18,7 @@ from probinet.input.loader import build_adjacency_from_file from probinet.models.classes import GraphData from probinet.models.constants import CONVERGENCE_TOL_, DECISION_, ERR_, ERR_MAX_, INF_ +from probinet.types import GraphDataType from probinet.utils.tools import log_and_raise_error from probinet.visualization.plot import plot_L @@ -67,7 +68,6 @@ class ModelBase(ModelBaseParameters): convergence. All the models classes should inherit from this class. """ - def __init__(self, *args, **kwargs): # Call the __init__ method of the parent class super().__init__(*args, **kwargs) @@ -113,7 +113,7 @@ def check_params_to_load_data(self, binary, noselfloop, undirected, **kwargs): """ pass - def _validate_eta0(self, eta0: Optional[float]) -> None: + def _validate_eta0(self, eta0: float) -> None: if eta0 is not None and eta0 <= 0.0: message = "If not None, the eta0 parameter has to be greater than 0.!" log_and_raise_error(ValueError, message) @@ -521,7 +521,7 @@ def _lambda_nz(self, subs_nz: tuple, temporal: bool = True) -> np.ndarray: def _ps_likelihood( self, - data: Union[COO, np.ndarray], + data: GraphDataType, data_T: COO, mask: Optional[np.ndarray] = None, ): @@ -542,7 +542,7 @@ def _check_for_convergence( Parameters ---------- - data : Union[COO, np.ndarray] + data : GraphDataType Graph adjacency tensor. it : int Current iteration number. @@ -560,7 +560,7 @@ def _check_for_convergence( Indices of elements of data that are non-zero. T : Optional[int] Number of time steps. - data_T : Optional[Union[COO, np.ndarray]] + data_T : Optional[GraphDataType] Graph adjacency tensor (transpose). mask : Optional[np.ndarray] Mask for selecting the held out set in the adjacency tensor in case of cross-validation. @@ -848,13 +848,13 @@ def get_params_to_load_data(self, args: Namespace) -> Dict[str, Any]: return {f: getattr(args, f) for f in fields} @singledispatchmethod - def get_data_sum(self, data) -> float: + def get_data_sum(self, data: GraphDataType) -> float: """ Compute the sum of the data. Parameters ---------- - data : Union[np.ndarray, COO] + data : GraphDataType The data to sum. Returns @@ -904,13 +904,13 @@ def _(self, data: COO) -> float: return data.data.sum() @singledispatchmethod - def get_data_toarray(self, data) -> np.ndarray: + def get_data_toarray(self, data: GraphDataType) -> np.ndarray: """ Convert the data to a numpy array. Parameters ---------- - data : Union[np.ndarray, COO] + data : GraphDataType The data to convert. 
Returns @@ -960,13 +960,13 @@ def _(self, data: COO) -> np.ndarray: return data.toarray() @singledispatchmethod - def get_data_nonzero(self, data) -> tuple: + def get_data_nonzero(self, data: GraphDataType) -> tuple: """ Get the indices of non-zero elements in the data. Parameters ---------- - data : Union[np.ndarray, COO] + data : GraphDataType The data to get non-zero indices from. Returns diff --git a/probinet/models/constants.py b/probinet/models/constants.py index 51c7ebb..05ebebe 100644 --- a/probinet/models/constants.py +++ b/probinet/models/constants.py @@ -2,6 +2,8 @@ This file contains the constants used in the models. """ +from pathlib import Path + # Constants EPS_ = 1e-12 # Small value to avoid division by zero INF_ = 1e10 # Large value to represent infinity @@ -9,3 +11,9 @@ CONVERGENCE_TOL_ = 1e-4 # Convergence threshold for the optimization algorithm ERR_ = 0.1 # Noise for the initialization DECISION_ = 10 # Convergence parameter +OUTPUT_FOLDER = Path("outputs") +RTOL_DEFAULT = 1e-05 +ATOL_DEFAULT = 1e-08 +AG_DEFAULT = 1.5 +BG_DEFAULT = 10.0 +K_DEFAULT = 3 diff --git a/probinet/models/crep.py b/probinet/models/crep.py index 4bd8159..af0b355 100644 --- a/probinet/models/crep.py +++ b/probinet/models/crep.py @@ -15,10 +15,16 @@ from .base import ModelBase, ModelUpdateMixin from .classes import GraphData +from .constants import OUTPUT_FOLDER from ..evaluation.expectation_computation import compute_mean_lambda0 from ..input.preprocessing import preprocess_adjacency_tensor -from ..types import GraphDataType -from ..types import GraphDataType +from ..types import ( + GraphDataType, + MaskType, + EndFileType, + FilesType, + ArraySequence, +) from ..utils.matrix_operations import sp_uttkrp, sp_uttkrp_assortative from ..utils.tools import get_item_array_from_subs, log_and_raise_error @@ -28,7 +34,6 @@ class CRep(ModelBase, ModelUpdateMixin): Class to perform inference in networks with reciprocity. """ - def __init__( self, max_iter: int = 1000, @@ -88,18 +93,18 @@ def fit( gdata: GraphData, rseed: int = 0, K: int = 3, - mask: Optional[np.ndarray] = None, + mask: Optional[MaskType] = None, initialization: int = 0, - eta0: Union[float, None] = None, + eta0: Optional[float] = None, undirected: bool = False, assortative: bool = True, constrained: bool = True, out_inference: bool = True, - out_folder: Path = Path("outputs"), - end_file: str = None, fix_eta: bool = False, fix_w: bool = False, - files: Optional[str] = None, + out_folder: Path = OUTPUT_FOLDER, + end_file: Optional[EndFileType] = None, + files: Optional[FilesType] = None, **_kwargs: Any, ) -> tuple[ ndarray[Any, dtype[np.float64]], @@ -125,7 +130,7 @@ def fit( Random seed, by default 0. K : int, optional Number of communities, by default 3. - mask : Optional[np.ndarray], optional + mask : MaskType, optional Mask for selecting the held-out set in the adjacency tensor in case of cross-validation, by default None. initialization : int, optional Initialization method for the models parameters, by default 0. @@ -276,9 +281,9 @@ def _initialize_realization(self) -> Tuple[int, bool, int, float, List[float]]: def _preprocess_data_for_fit( self, data: GraphDataType, - data_T: Union[COO, np.ndarray, None], + data_T: Union[GraphDataType, None], data_T_vals: Union[np.ndarray, None], - ) -> Tuple[int, Any, Any, np.ndarray, Tuple[np.ndarray]]: + ) -> tuple[float, COO | ndarray, COO | ndarray, ndarray | None, tuple]: """ Preprocess the data for fitting the models. 
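The names added to `models/constants.py` above (`OUTPUT_FOLDER`, `AG_DEFAULT`, `BG_DEFAULT`, `K_DEFAULT`, `RTOL_DEFAULT`, `ATOL_DEFAULT`) replace literals that were previously repeated across the `fit` signatures. Since `Path`, `float`, and `int` objects are immutable, sharing them as keyword defaults is safe. A condensed sketch of the pattern (the `fit` stub is illustrative, not the real signature):

```python
from pathlib import Path

# Single source of truth, as in probinet/models/constants.py:
OUTPUT_FOLDER = Path("outputs")
AG_DEFAULT = 1.5   # shape of the gamma prior
BG_DEFAULT = 10.0  # rate of the gamma prior
K_DEFAULT = 3      # default number of communities

def fit(ag: float = AG_DEFAULT, bg: float = BG_DEFAULT,
        K: int = K_DEFAULT, out_folder: Path = OUTPUT_FOLDER) -> None:
    # Illustrative stub: models reference the named constants instead of literals.
    out_folder.mkdir(parents=True, exist_ok=True)
```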
@@ -331,14 +336,14 @@ def _update_cache( self, data: GraphDataType, data_T_vals: np.ndarray, - subs_nz: Tuple[np.ndarray], + subs_nz: ArraySequence, ) -> None: """ Update the cache used in the em_update. Parameters ---------- - data : Union[COO, np.ndarray] + data : GraphDataType Graph adjacency tensor. data_T_vals : ndarray Array with values of entries A[j, i] given non-zero entry (i, j). @@ -358,7 +363,7 @@ def _update_em(self): Parameters ---------- - data : Union[COO, np.ndarray] + data : GraphDataType Graph adjacency tensor. data_T_vals : ndarray Array with values of entries A[j, i] given non-zero entry (i, j). @@ -420,7 +425,7 @@ def _update_eta( Parameters ---------- - data : Union[COO, np.ndarray] + data : GraphDataType Graph adjacency tensor. data_T_vals : ndarray Array with values of entries A[j, i] given non-zero entry (i, j). @@ -531,7 +536,7 @@ def _specific_update_W_assortative(self): non_zeros = Z > 0 self.w[non_zeros] /= Z[non_zeros] - def _update_membership(self, subs_nz: Tuple[np.ndarray], m: int) -> np.ndarray: + def _update_membership(self, subs_nz: ArraySequence, m: int) -> np.ndarray: """ Return the Khatri-Rao product (sparse version) used in the update of the membership matrices. @@ -566,16 +571,16 @@ def _ps_likelihood( self, data: GraphDataType, data_T: COO, - mask: Optional[np.ndarray] = None, + mask: Optional[MaskType] = None, ) -> float: """ Compute the pseudo log-likelihood of the data. Parameters ---------- - data : Union[COO, np.ndarray] + data : GraphDataType Graph adjacency tensor. - data_T : Union[COO, np.ndarray] + data_T : GraphDataType Graph adjacency tensor (transpose). mask : ndarray Mask for selecting the held out set in the adjacency tensor in case of diff --git a/probinet/models/dyncrep.py b/probinet/models/dyncrep.py index 0e79d97..bec974c 100644 --- a/probinet/models/dyncrep.py +++ b/probinet/models/dyncrep.py @@ -6,7 +6,7 @@ import time from argparse import Namespace from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, Optional, Tuple import numpy as np from scipy.optimize import brentq, root @@ -14,14 +14,20 @@ from .base import ModelBase, ModelUpdateMixin from .classes import GraphData -from .constants import EPS_ +from .constants import EPS_, OUTPUT_FOLDER from ..evaluation.expectation_computation import ( compute_lagrange_multiplier, compute_mean_lambda0, u_with_lagrange_multiplier, ) from ..input.preprocessing import preprocess_adjacency_tensor -from ..types import GraphDataType +from ..types import ( + GraphDataType, + MaskType, + EndFileType, + FilesType, + ArraySequence, +) from ..utils.matrix_operations import sp_uttkrp, sp_uttkrp_assortative from ..utils.tools import get_item_array_from_subs, log_and_raise_error @@ -68,15 +74,6 @@ def get_params_to_load_data(self, args: Namespace) -> Dict[str, Any]: return data_kwargs - def get_params_to_load_data(self, args: Namespace) -> Dict[str, Any]: - # Get the parameters for loading the data - data_kwargs = super().get_params_to_load_data(args) - - # Additional fields - data_kwargs["T"] = args.T - - return data_kwargs - def _check_fit_params( self, **kwargs: Any, @@ -136,7 +133,7 @@ def fit( self, gdata: GraphData, T: Optional[int] = None, - mask: Optional[np.ndarray] = None, + mask: Optional[MaskType] = None, K: int = 2, rseed: int = 0, ag: float = 1.0, @@ -155,9 +152,9 @@ def fit( fix_w: bool = False, undirected: bool = False, out_inference: bool = True, - out_folder: Path = Path("outputs"), - end_file: str = None, - files: str = 
None, + out_folder: Path = OUTPUT_FOLDER, + end_file: Optional[EndFileType] = None, + files: Optional[FilesType] = None, **_kwargs: Any, ) -> Tuple[ np.ndarray, @@ -173,13 +170,13 @@ def fit( Parameters ---------- - data : Union[COO, np.ndarray] + data : GraphDataType Graph adjacency tensor. T : int Number of time steps. nodes : List[int] List of node IDs. - mask : Optional[np.ndarray], optional + mask : MaskType, optional Mask for selecting the held-out set in the adjacency tensor in case of cross-validation, by default None. K : int, optional Number of communities, by default 2. @@ -364,7 +361,7 @@ def _initialize_realization(self): def _preprocess_data_for_fit(self, T: int, data: GraphDataType) -> Tuple[ int, - Union[COO, np.ndarray], + GraphDataType, np.ndarray, np.ndarray, np.ndarray, @@ -378,7 +375,7 @@ def _preprocess_data_for_fit(self, T: int, data: GraphDataType) -> Tuple[ ---------- T : int Number of time steps. - data : Union[COO, np.ndarray] + data : GraphDataType The input data tensor. temporal : bool Flag to determine if the function should behave in a temporal manner. @@ -521,14 +518,14 @@ def _update_cache( self, data: GraphDataType, data_T_vals: np.ndarray, - subs_nz: Tuple[np.ndarray], + subs_nz: ArraySequence, ) -> None: """ Update the cache used in the em_update. Parameters ---------- - data : Union[COO, np.ndarray] + data : GraphDataType Graph adjacency tensor. data_T_vals : ndarray Array with values of entries A[j, i] given non-zero entry (i, j). @@ -981,8 +978,8 @@ def _likelihood( # type: ignore data: GraphDataType, data_T: GraphDataType, data_T_vals: np.ndarray, - subs_nz: Tuple[np.ndarray], - mask: Optional[np.ndarray] = None, + subs_nz: ArraySequence, + mask: Optional[MaskType] = None, ) -> float: # The inputs do change sometimes, so keeping it like this to avoid # conflicts during the execution. """ @@ -990,17 +987,17 @@ def _likelihood( # type: ignore Parameters ---------- - data : Union[COO, np.ndarray] + data : GraphDataType Graph adjacency tensor. - data_T : Union[COO, np.ndarray] + data_T : GraphDataType Graph adjacency tensor (transpose). data_T_vals : np.ndarray Array with values of entries A[j, i] given non-zero entry (i, j). - subs_nz : Tuple[np.ndarray] + subs_nz : TupleArrays Indices of elements of data that are non-zero. T : int Number of time steps. - mask : Optional[np.ndarray] + mask : MaskType Mask for selecting the held out set in the adjacency tensor in case of cross-validation. EPS : float, default 1e-12 Small constant to prevent division by zero. 
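The hunk above also deleted a verbatim duplicate of `get_params_to_load_data` from `DynCRep`. Dropping it is behavior-preserving: a repeated `def` at the same scope silently rebinds the name, so only the last definition was ever in effect. The same reasoning applies to the duplicated `@singledispatch mask_or_flatten_array` definitions collapsed in `link_prediction.py` earlier in this diff. A self-contained demonstration:

```python
class Demo:
    def method(self):
        return "first"

    def method(self):  # rebinds the name; the first definition is discarded
        return "second"

print(Demo().method())  # prints "second"
```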
diff --git a/probinet/models/jointcrep.py b/probinet/models/jointcrep.py index ce3a9b7..e607994 100644 --- a/probinet/models/jointcrep.py +++ b/probinet/models/jointcrep.py @@ -7,17 +7,16 @@ import logging import time from pathlib import Path -from typing import Any, List, Tuple, Union +from typing import Any, Tuple, Union, Optional import numpy as np -from sparse import COO from .base import ModelBase, ModelUpdateMixin from .classes import GraphData +from .constants import OUTPUT_FOLDER, K_DEFAULT from ..evaluation.expectation_computation import compute_mean_lambda0 from ..input.preprocessing import preprocess_adjacency_tensor -from ..types import GraphDataType -from ..types import GraphDataType +from ..types import GraphDataType, EndFileType, FilesType from ..utils.matrix_operations import sp_uttkrp, sp_uttkrp_assortative, transpose_tensor from ..utils.tools import check_symmetric, get_item_array_from_subs, log_and_raise_error @@ -64,9 +63,9 @@ def fit( self, gdata: GraphData, rseed: int = 0, - K: int = 3, + K: int = K_DEFAULT, initialization: int = 0, - eta0: Union[float, None] = None, + eta0: Optional[float] = None, undirected: bool = False, assortative: bool = True, fix_eta: bool = False, @@ -74,9 +73,9 @@ def fit( fix_w: bool = False, use_approximation: bool = False, out_inference: bool = True, - out_folder: Path = Path("outputs"), - end_file: str = None, - files: str = None, + out_folder: Path = OUTPUT_FOLDER, + end_file: Optional[EndFileType] = None, + files: Optional[FilesType] = None, **_kwargs: Any, ) -> tuple[ np.ndarray[Any, np.dtype[np.float64]], diff --git a/probinet/models/mtcov.py b/probinet/models/mtcov.py index 9a3ff33..e921734 100644 --- a/probinet/models/mtcov.py +++ b/probinet/models/mtcov.py @@ -10,7 +10,7 @@ import time from argparse import Namespace from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple import numpy as np import scipy.sparse @@ -18,11 +18,11 @@ from .base import ModelBase, ModelUpdateMixin from .classes import GraphData +from .constants import OUTPUT_FOLDER from ..evaluation.expectation_computation import compute_mean_lambda0 from ..input.loader import build_adjacency_and_design_from_file from ..input.preprocessing import preprocess_adjacency_tensor, preprocess_data_matrix -from ..types import GraphDataType -from ..types import GraphDataType +from ..types import GraphDataType, EndFileType, FilesType, ArraySequence from ..utils.matrix_operations import sp_uttkrp, sp_uttkrp_assortative MAX_BATCH_SIZE = 5000 @@ -51,7 +51,7 @@ def __init__( self.__doc__ = ModelBase.__init__.__doc__ - def load_data(self, files, adj_name, **kwargs): + def load_data(self, files: str, adj_name: str, **kwargs): """ Load data from the input folder. """ @@ -109,9 +109,9 @@ def fit( undirected: bool = False, assortative: bool = False, out_inference: bool = True, - out_folder: Path = Path("outputs"), - end_file: str = None, - files: str = None, + out_folder: Path = OUTPUT_FOLDER, + end_file: Optional[EndFileType] = None, + files: Optional[FilesType] = None, **__kwargs: Any, ) -> tuple[ np.ndarray[Any, np.dtype[np.float64]], @@ -126,7 +126,7 @@ def fit( Parameters ---------- - data : Union[COO, np.ndarray] + data : GraphDataType Graph adjacency tensor. data_X : np.ndarray Object representing the one-hot encoding version of the design matrix. 
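`JointCRep.fit`, like `CRep.fit` earlier, now spells its nullable parameter as `Optional[float]` instead of `Union[float, None]`. The two are the same type, since `Optional[X]` is defined as `Union[X, None]`, so this is a readability change only:

```python
from typing import Optional, Union

assert Optional[float] == Union[float, None]  # identical types, different spelling
```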
@@ -281,13 +281,13 @@ def preprocess_data_for_fit( data_X: np.ndarray, batch_size: Optional[int] = None, ) -> Tuple[ - Union[COO, np.ndarray], + GraphDataType, np.ndarray, - Tuple[np.ndarray], - Tuple[np.ndarray], + ArraySequence, + ArraySequence, Optional[np.ndarray], - Optional[Tuple[np.ndarray]], - Optional[Tuple[np.ndarray]], + Optional[ArraySequence], + Optional[ArraySequence], ]: """ Preprocesses the input data for fitting the models. @@ -297,7 +297,7 @@ def preprocess_data_for_fit( Parameters ---------- - data : Union[COO, np.ndarray] + data : GraphDataType The graph adjacency tensor to be preprocessed. data_X : np.ndarray The one-hot encoding version of the design matrix to be preprocessed. @@ -308,19 +308,19 @@ def preprocess_data_for_fit( The maximum batch size to use when automatically determining the batch Returns ------- - preprocessed_data : Union[COO, np.ndarray] + preprocessed_data : GraphDataType The preprocessed graph adjacency tensor. preprocessed_data_X : np.ndarray The preprocessed one-hot encoding version of the design matrix. - subs_nz : Tuple[np.ndarray] + subs_nz : TupleArrays The indices of the non-zero entries in the data. - subs_X_nz : Tuple[np.ndarray] + subs_X_nz : TupleArrays The indices of the non-zero entries in the design matrix. subset_N : Optional[np.ndarray] The subset of nodes selected for batch processing. None if no subset is selected. - Subs : Optional[Tuple[np.ndarray]] + Subs : Optional[TupleArrays] The list of tuples representing the non-zero entries in the data. None if no subset is selected. - SubsX : Optional[Tuple[np.ndarray]] + SubsX : Optional[TupleArrays] The list of tuples representing the non-zero entries in the design matrix. None if no subset is selected. """ @@ -337,7 +337,7 @@ def preprocess_data_for_fit( logging.debug("batch_size: %s", batch_size) - return data, data_X, subs_nz, subs_X_nz, subset_N, Subs, SubsX # type: ignore + return data, data_X, subs_nz, subs_X_nz, subset_N, Subs, SubsX def _initialize_realization(self): """ @@ -463,22 +463,22 @@ def _get_data_pi_nz(self, data_X, subs_X_nz): def _update_cache( self, data: GraphDataType, - subs_nz: Tuple[np.ndarray], + subs_nz: ArraySequence, data_X: np.ndarray, - subs_X_nz: Tuple[np.ndarray], + subs_X_nz: ArraySequence, ) -> None: """ Update the cache used in the em_update. Parameters ---------- - data : Union[COO, np.ndarray] + data : GraphDataType Graph adjacency tensor. - subs_nz : Tuple[np.ndarray] + subs_nz : TupleArrays Indices of elements of data that are non-zero. data_X : np.ndarray Object representing the one-hot encoding version of the design matrix. - subs_X_nz : Tuple[np.ndarray] + subs_X_nz : TupleArrays Indices of elements of data_X that are non-zero. """ @@ -494,7 +494,7 @@ def _update_cache( def _pi0_nz( self, - subs_X_nz: Tuple[np.ndarray], + subs_X_nz: ArraySequence, u: np.ndarray, v: np.ndarray, beta: np.ndarray, @@ -562,7 +562,7 @@ def _specific_update_W_assortative(self): for a in range(self.L): self.w[a, non_zeros] /= Z[non_zeros] - def _update_beta(self, subs_X_nz: Tuple[np.ndarray]) -> float: + def _update_beta(self, subs_X_nz: ArraySequence) -> float: """ Update beta matrix. 
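`ArraySequence` (defined in `probinet/types.py`, further down in this diff) replaces annotations like `Tuple[np.ndarray]`, which strictly denotes a 1-tuple holding a single array. What these functions actually receive is what `np.nonzero` returns on a 3-d tensor: one index array per axis, so a `Sequence[np.ndarray]` alias is the honest annotation (the stale `TupleArrays` name left in a few docstrings refers to this same alias). A quick check:

```python
from typing import Sequence
import numpy as np

ArraySequence = Sequence[np.ndarray]  # as in probinet/types.py

data = np.zeros((2, 3, 3))
data[0, 1, 2] = data[1, 0, 0] = 1.0
subs_nz: ArraySequence = np.nonzero(data)
print(len(subs_nz))  # 3 -> one index array per axis (layer, row, column)
print(subs_nz[0])    # [0 1] -> layer indices of the two non-zero entries
```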
@@ -616,8 +616,8 @@ def _specific_update_V(self) -> None: def _update_membership( self, - subs_nz: Tuple[np.ndarray], - subs_X_nz: Tuple[np.ndarray], + subs_nz: ArraySequence, + subs_X_nz: ArraySequence, u: np.ndarray, v: np.ndarray, w: np.ndarray, @@ -721,7 +721,7 @@ def _likelihood_batch( Parameters ---------- - data : Union[COO, np.ndarray] + data : GraphDataType Graph adjacency tensor. data_X : ndarray Object representing the one-hot encoding version of the design matrix. @@ -767,9 +767,7 @@ def _likelihood_batch( else: return loglik - def _check_for_convergence_delta( - self, it, coincide, du, dv, dw, db, convergence - ): # pylint: disable=arguments-renamed + def _check_for_convergence_delta(self, it, coincide, du, dv, dw, db, convergence): """ Check for convergence by using the maximum distances between the old and the new parameters values. diff --git a/probinet/synthetic/anomaly.py b/probinet/synthetic/anomaly.py index 6ba5b2d..6b81520 100644 --- a/probinet/synthetic/anomaly.py +++ b/probinet/synthetic/anomaly.py @@ -3,6 +3,7 @@ """ import logging +from pathlib import Path from typing import Optional import matplotlib.pyplot as plt @@ -11,10 +12,11 @@ from scipy import sparse from scipy.optimize import brentq -from probinet.synthetic.base import GraphProcessingMixin, affinity_matrix -from probinet.synthetic.dynamic import eq_c, membership_vectors -from probinet.utils.tools import flt -from probinet.visualization.plot import plot_M +from ..models.constants import OUTPUT_FOLDER +from ..synthetic.base import GraphProcessingMixin, affinity_matrix +from ..synthetic.dynamic import eq_c, membership_vectors +from ..utils.tools import flt +from ..visualization.plot import plot_M EPS = 1e-12 # Small value to avoid division by zero @@ -42,7 +44,7 @@ def __init__( corr: float = 0.0, over: float = 0.0, verbose: int = 0, - out_folder: str = None, + out_folder: Path = OUTPUT_FOLDER, output_parameters: bool = False, output_adj: bool = False, outfile_adj: Optional[str] = None, diff --git a/probinet/synthetic/base.py b/probinet/synthetic/base.py index ec0b76a..f1b315d 100644 --- a/probinet/synthetic/base.py +++ b/probinet/synthetic/base.py @@ -1,6 +1,11 @@ +""" +Base classes for synthetic network generation. 
+""" + from abc import ABCMeta import logging import math +from os import PathLike from enum import Enum from pathlib import Path from typing import Tuple, Optional, Union @@ -36,6 +41,7 @@ DEFAULT_SHOW_PLOTS = False DEFAULT_OUTPUT_NET = False + class Structure(Enum): ASSORTATIVE = "assortative" DISASSORTATIVE = "disassortative" @@ -259,11 +265,11 @@ def __init__( K: int = DEFAULT_K, seed: int = DEFAULT_SEED, eta: float = DEFAULT_ETA, - out_folder: Path = None, + out_folder: Optional[PathLike] = None, output_parameters: bool = DEFAULT_OUTPUT_NET, output_adj: bool = DEFAULT_OUTPUT_NET, - outfile_adj: str = None, - end_file: str = None, + outfile_adj: Optional[str] = None, + end_file: Optional[str] = None, show_details: bool = DEFAULT_SHOW_DETAILS, show_plots: bool = DEFAULT_SHOW_PLOTS, **kwargs, # these kwargs are needed later on @@ -550,14 +556,13 @@ def build_Y( nodes_to_remove = [] self.G = [] - self.layer_graphs = [] for layer in range(self.L): self.G.append(nx.from_numpy_array(Y[layer], create_using=nx.DiGraph())) Gc = max(nx.weakly_connected_components(self.G[layer]), key=len) nodes_to_remove.append(set(self.G[layer].nodes()).difference(Gc)) n_to_remove = nodes_to_remove[0].intersection(*nodes_to_remove) - self.layer_graphs = [] + self.layer_graphs: list[np.ndarray] = [] for layer in range(self.L): self._remove_nodes(self.G[layer], list(n_to_remove)) self.nodes = self._update_nodes_list(self.G[layer]) @@ -699,7 +704,7 @@ def _plot_M(self, cmap: str = "PuBuGn") -> None: def affinity_matrix( - structure: Union[Structure,str] = Structure.ASSORTATIVE.value, + structure: Union[Structure, str] = Structure.ASSORTATIVE.value, N: int = 100, K: int = 2, avg_degree: float = 4.0, diff --git a/probinet/synthetic/dynamic.py b/probinet/synthetic/dynamic.py index 6013a80..00ef441 100644 --- a/probinet/synthetic/dynamic.py +++ b/probinet/synthetic/dynamic.py @@ -4,6 +4,7 @@ import logging import math +from pathlib import Path from typing import List, Optional, Tuple import matplotlib.pyplot as plt @@ -17,7 +18,7 @@ from probinet.visualization.plot import plot_M from .base import GraphProcessingMixin, affinity_matrix -from ..models.constants import EPS_ +from ..models.constants import EPS_, OUTPUT_FOLDER from ..utils.matrix_operations import normalize_nonzero_membership @@ -45,7 +46,7 @@ def __init__( over: float = 0.0, label: Optional[str] = None, end_file: str = ".dat", - folder: str = "", + folder: Path = OUTPUT_FOLDER, structure: str = "assortative", output_parameters: bool = False, output_adj: bool = False, diff --git a/probinet/synthetic/multilayer.py b/probinet/synthetic/multilayer.py index 1c29760..685396b 100644 --- a/probinet/synthetic/multilayer.py +++ b/probinet/synthetic/multilayer.py @@ -2,10 +2,13 @@ Code to generate multilayer networks with non-negative and discrete weights, and whose nodes are associated with one categorical attribute. Self-loops are removed and only the largest connected component is considered. 
""" + import logging from abc import ABCMeta import os import warnings +from pathlib import Path +from typing import Optional import matplotlib.pyplot as plt import networkx as nx @@ -14,6 +17,7 @@ from probinet.evaluation.expectation_computation import compute_mean_lambda0 from probinet.input.stats import print_graph_stat +from probinet.models.constants import OUTPUT_FOLDER from probinet.synthetic.base import StandardMMSBM from probinet.utils.matrix_operations import normalize_nonzero_membership from probinet.utils.tools import output_adjacency @@ -106,7 +110,7 @@ def plot_X(X, cmap="PuBuGn"): fig, ax = plt.subplots(figsize=(7, 7)) ax.matshow(pd.get_dummies(X), cmap=plt.get_cmap(cmap), aspect="auto") - ax.set_title(f"Design matrix", fontsize=15) + ax.set_title("Design matrix", fontsize=15) for PCM in ax.get_children(): if isinstance(PCM, plt.cm.ScalarMappable): break @@ -128,7 +132,7 @@ def __init__( K: int = DEFAULT_K, Z: int = DEFAULT_Z, seed: int = DEFAULT_SEED, - out_folder: str = DEFAULT_OUT_FOLDER, + out_folder: Path = OUTPUT_FOLDER, output_net: bool = DEFAULT_OUTPUT_NET, show_details: bool = DEFAULT_SHOW_DETAILS, show_plots: bool = DEFAULT_SHOW_PLOTS, @@ -160,14 +164,9 @@ class SyntheticMTCOV(BaseSyntheticNetwork, StandardMMSBM): def __init__(self, **kwargs): super().__init__(**kwargs) - if "parameters" in kwargs: - parameters = kwargs["parameters"] - else: - parameters = None - if "attributes" in kwargs: - attributes = kwargs["attributes"] - else: - attributes = None + + parameters = kwargs.get("parameters", None) + attributes = kwargs.get("attributes", None) self.init_mmsbm_params(**kwargs) @@ -349,8 +348,8 @@ def build_Y(self, parameters=None): # Generate Y self.M = compute_mean_lambda0(self.u, self.v, self.w) - for l in range(self.L): - np.fill_diagonal(self.M[l], 0) + for layer in range(self.L): + np.fill_diagonal(self.M[layer], 0) # sparsity parameter for Y if self.is_sparse: c = self.ExpEdges / self.M.sum() @@ -361,31 +360,31 @@ def build_Y(self, parameters=None): Y = self.prng.poisson(self.M) if not self.directed: # symmetrize - for l in range(self.L): - Y[l] = Y[l] + Y[l].T - np.diag(Y[l].diagonal()) + for layer in range(self.L): + Y[layer] = Y[layer] + Y[layer].T - np.diag(Y[layer].diagonal()) # Create networkx Graph objects for each layer for easier manipulation nodes_to_remove = [] self.G = [] self.layer_graphs = [] - for l in range(self.L): + for layer in range(self.L): if self.directed: - self.G.append(nx.from_numpy_array(Y[l], create_using=nx.DiGraph())) - Gc = max(nx.weakly_connected_components(self.G[l]), key=len) - nodes_to_remove.append(set(self.G[l].nodes()).difference(Gc)) + self.G.append(nx.from_numpy_array(Y[layer], create_using=nx.DiGraph())) + Gc = max(nx.weakly_connected_components(self.G[layer]), key=len) + nodes_to_remove.append(set(self.G[layer].nodes()).difference(Gc)) else: - self.G.append(nx.from_numpy_array(Y[l], create_using=nx.Graph())) + self.G.append(nx.from_numpy_array(Y[layer], create_using=nx.Graph())) if self.directed: n_to_remove = nodes_to_remove[0].intersection(*nodes_to_remove) - for l in range(self.L): + for layer in range(self.L): if self.directed: - self.G[l].remove_nodes_from(list(n_to_remove)) - self.nodes = list(self.G[l].nodes()) + self.G[layer].remove_nodes_from(list(n_to_remove)) + self.nodes = list(self.G[layer].nodes()) self.layer_graphs.append( - nx.to_scipy_sparse_array(self.G[l], nodelist=self.nodes) + nx.to_scipy_sparse_array(self.G[layer], nodelist=self.nodes) ) self.u = self.u[self.nodes] @@ -393,8 +392,9 @@ def 
build_Y(self, parameters=None):
         self.N = len(self.nodes)
         self.Y = Y[np.ix_(np.arange(self.L), self.nodes, self.nodes)]
 
-    def build_X(self, attributes: np.ndarray = None):
+    def build_X(self, attributes: Optional[np.ndarray] = None):
         """
+
         Generate the design matrix.
 
         Parameters
@@ -449,16 +449,15 @@ def _generate_lv_attributes(self):
 
         return beta
 
-
     def _plot_M(self, cmap="PuBuGn"):
         """
         Plot the marginal means produced by the generative algorithm.
         """
-        for l in range(self.L):
+        for layer in range(self.L):
             fig, ax = plt.subplots(figsize=(7, 7))
-            ax.matshow(self.M[l], cmap=plt.get_cmap(cmap))
-            ax.set_title(f"Marginal means matrix layer {l}", fontsize=15)
+            ax.matshow(self.M[layer], cmap=plt.get_cmap(cmap))
+            ax.set_title(f"Marginal means matrix layer {layer}", fontsize=15)
             for PCM in ax.get_children():
                 if isinstance(PCM, plt.cm.ScalarMappable):
                     break
diff --git a/probinet/synthetic/reciprocity.py b/probinet/synthetic/reciprocity.py
index a9e9b2b..945d096 100644
--- a/probinet/synthetic/reciprocity.py
+++ b/probinet/synthetic/reciprocity.py
@@ -7,11 +7,13 @@
 import sys
 import warnings
 
+from pathlib import Path
+
 from scipy.sparse import tril, triu
 
 from ..evaluation.expectation_computation import compute_mean_lambda0
 from ..input.stats import reciprocal_edges
+from ..models.constants import OUTPUT_FOLDER
 from ..utils.matrix_operations import (
     Exp_ija_matrix,
     normalize_nonzero_membership,
@@ -32,11 +34,11 @@
 from ..input.stats import print_graph_stat
 from ..utils.tools import check_symmetric, log_and_raise_error, output_adjacency
 from .base import (
-    BaseSyntheticNetwork,
     DEFAULT_ETA,
     StandardMMSBM,
     GraphProcessingMixin,
     affinity_matrix,
+    BaseSyntheticNetwork,
 )
 
 if not sys.warnoptions:
@@ -62,11 +64,11 @@
         beta: float = 0.1,
         Normalization: int = 0,
         structure: str = "assortative",
-        end_file: str = None,
-        out_folder: str = None,
+        end_file: Optional[str] = None,
+        out_folder: Path = OUTPUT_FOLDER,
         output_parameters: bool = False,
         output_adj: bool = False,
-        outfile_adj: str = None,
+        outfile_adj: Optional[str] = None,
         ExpM: Optional[float] = None,
     ):
         """
diff --git a/probinet/types.py b/probinet/types.py
index e02dc95..e1682dd 100644
--- a/probinet/types.py
+++ b/probinet/types.py
@@ -1,6 +1,15 @@
-from typing import Union
+"""
+Custom type hints.
+"""
+from os import PathLike
+from typing import Union, Sequence
 
 import numpy as np
 from sparse import COO
 
+EndFileType = str
+FilesType = PathLike
 GraphDataType = Union[COO, np.ndarray]
+MaskType = np.ndarray
+SubsNzType = tuple[int, int, int]
+ArraySequence = Sequence[np.ndarray]
diff --git a/probinet/utils/tools.py b/probinet/utils/tools.py
index 9e7f99c..f1d5022 100644
--- a/probinet/utils/tools.py
+++ b/probinet/utils/tools.py
@@ -3,14 +3,18 @@
 """
 
 import logging
+from os import PathLike
 from pathlib import Path
-from typing import Dict, List, Tuple, Type, Union
+from typing import Dict, List, Type, Union
 
 import networkx as nx
 import numpy as np
 import pandas as pd
 from sparse import COO
 
+from ..models.constants import RTOL_DEFAULT, ATOL_DEFAULT
+from ..types import ArraySequence
+
 
 def can_cast_to_int(string: Union[int, float, str]) -> bool:
     """
@@ -95,7 +99,7 @@ def sptensor_from_dense_array(X: np.ndarray) -> COO:
     return sparse_X
 
 
-def get_item_array_from_subs(A: np.ndarray, ref_subs: Tuple[np.ndarray]) -> np.ndarray:
+def get_item_array_from_subs(A: np.ndarray, ref_subs: ArraySequence) -> np.ndarray:
     """
     Retrieves the values of specific entries in a dense tensor.
     Output is a 1-d array with dimension = number of non-zero entries.
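`get_item_array_from_subs` now takes the same `ArraySequence` alias. Its docstring's promise, a 1-d output with one value per requested entry, is plain NumPy fancy indexing; a toy illustration (array and indices chosen arbitrarily):

```python
import numpy as np

A = np.arange(27).reshape(3, 3, 3)  # dense 3-d tensor
ref_subs = (np.array([0, 1]), np.array([1, 2]), np.array([2, 0]))
print(A[ref_subs])  # [ 5 15] -> A[0, 1, 2] and A[1, 2, 0]
```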
@@ -118,7 +122,9 @@ def get_item_array_from_subs(A: np.ndarray, ref_subs: Tuple[np.ndarray]) -> np.n def check_symmetric( - a: Union[np.ndarray, List[np.ndarray]], rtol: float = 1e-05, atol: float = 1e-08 + a: Union[np.ndarray, List[np.ndarray]], + rtol: float = RTOL_DEFAULT, + atol: float = ATOL_DEFAULT, ) -> bool: """ Check if a matrix or a list of matrices is symmetric. @@ -174,7 +180,7 @@ def build_edgelist(A: COO, layer: int) -> pd.DataFrame: return df_res -def output_adjacency(A: List, out_folder: Union[str | Path], label: str): +def output_adjacency(A: List, out_folder: PathLike, label: str): """ Save the adjacency tensor to a file. Default format is space-separated .csv with L+2 columns: source_node target_node diff --git a/tests/data/synthetic/test_anomaly_network_PB.yaml b/tests/data/synthetic/test_anomaly_network_PB.yaml index 9433b32..042c452 100644 --- a/tests/data/synthetic/test_anomaly_network_PB.yaml +++ b/tests/data/synthetic/test_anomaly_network_PB.yaml @@ -3,7 +3,7 @@ K: 2 m: 1 rseed: 10 label: "10_2_4.0_0.1" -out_folder: null +out_folder: "outputs" output_parameters: false output_adj: false outfile_adj: null diff --git a/tests/data/synthetic/test_anomaly_network_PB_with_parameters.yaml b/tests/data/synthetic/test_anomaly_network_PB_with_parameters.yaml index b23043f..47b2bab 100644 --- a/tests/data/synthetic/test_anomaly_network_PB_with_parameters.yaml +++ b/tests/data/synthetic/test_anomaly_network_PB_with_parameters.yaml @@ -3,7 +3,7 @@ K: 3 m: 1 rseed: 10 label: "200_3_4.0_0.1" -out_folder: null +out_folder: "outputs" output_parameters: false output_adj: false outfile_adj: null diff --git a/tests/test_synthetic.py b/tests/test_synthetic.py index 3d54164..1d57c31 100644 --- a/tests/test_synthetic.py +++ b/tests/test_synthetic.py @@ -318,7 +318,7 @@ def assert_synt_net_anomaly_attributes(self, syn_acd, expected_values): self.assertEqual(syn_acd.m, expected_values["m"]) self.assertEqual(syn_acd.rseed, expected_values["rseed"]) self.assertEqual(syn_acd.label, expected_values["label"]) - self.assertEqual(syn_acd.out_folder, expected_values["out_folder"]) + self.assertEqual(syn_acd.out_folder.name, expected_values["out_folder"]) self.assertEqual( syn_acd.output_parameters, expected_values["output_parameters"] )
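Finally, `check_symmetric` reads its tolerances from the new named constants (which mirror `np.allclose`'s own defaults), and the synthetic-network test compares `out_folder.name` because `out_folder` is now a `Path` rather than a string or `None`. A minimal sketch of the single-matrix case, under those assumptions:

```python
import numpy as np
from pathlib import Path

RTOL_DEFAULT = 1e-05  # values from probinet/models/constants.py
ATOL_DEFAULT = 1e-08

def is_symmetric(a: np.ndarray,
                 rtol: float = RTOL_DEFAULT, atol: float = ATOL_DEFAULT) -> bool:
    # Sketch of the single-matrix branch of check_symmetric.
    return np.allclose(a, a.transpose(), rtol=rtol, atol=atol)

print(is_symmetric(np.array([[1.0, 2.0], [2.0, 1.0]])))  # True
print(Path("outputs").name)  # 'outputs' -- the component the updated test asserts on
```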