diff --git a/cdlib/algorithms/edge_clustering.py b/cdlib/algorithms/edge_clustering.py
index ec591c2..2d76ebe 100644
--- a/cdlib/algorithms/edge_clustering.py
+++ b/cdlib/algorithms/edge_clustering.py
@@ -1,10 +1,23 @@
 from cdlib import EdgeClustering
 from collections import defaultdict
 import networkx as nx
-from cdlib.utils import convert_graph_formats, nx_node_integer_mapping, remap_edge_communities
-from cdlib.algorithms.internal.HLC import HLC, HLC_read_edge_list_unweighted, HLC_read_edge_list_weighted, HLC_full
-
-__all__ = ["hierarchical_link_community", "hierarchical_link_community_w", "hierarchical_link_community_full"]
+from cdlib.utils import (
+    convert_graph_formats,
+    nx_node_integer_mapping,
+    remap_edge_communities,
+)
+from cdlib.algorithms.internal.HLC import (
+    HLC,
+    HLC_read_edge_list_unweighted,
+    HLC_read_edge_list_weighted,
+    HLC_full,
+)
+
+__all__ = [
+    "hierarchical_link_community",
+    "hierarchical_link_community_w",
+    "hierarchical_link_community_full",
+]
 
 
 def hierarchical_link_community(g_original: object) -> EdgeClustering:
@@ -96,10 +109,18 @@ def hierarchical_link_community_w(g_original: object) -> EdgeClustering:
     return EdgeClustering(coms, g_original, "HLC_w", method_parameters={})
 
 
+def hierarchical_link_community_full(
+    g_original: object,
+    weight="weight",
+    simthr=None,
+    hcmethod="single",
+    min_edges=None,
+    verbose=False,
+) -> EdgeClustering:
     """
     HLC (hierarchical link clustering) is a method to classify links into topologically related groups.
    The algorithm uses a similarity between links to build a dendrogram where each leaf is a link from the original network and branches represent link communities.
-    At each level of the link dendrogram is calculated the partition density function, based on link density inside communities, to pick the best level to cut.
+    At each level of the link dendrogram, the partition density function, based on link density inside communities, is calculated to pick the best level to cut. This implementation exactly follows the algorithm described in Ahn et al. and uses numpy/scipy to speed up the clustering computation (it is faster and consumes less memory).
@@ -112,11 +133,11 @@ def hierarchical_link_community_w(g_original: object) -> EdgeClustering:
     ========== ======== ========
 
     :param g_original: a networkx/igraph object
-    :weight: None for unweighted networks, jaccard approximation is used. When defined with a string, edge attribute name (usually 'weight') to be used as weight and Tanimoto approximation is used.
-    :simthr: None by default. If set to float, all values less than threshold are set to 0 in similarity matrix (it could reduce memory usage).
-    :hcmethod: Linkage method used in hierarchical clustering, 'single' by default. See scipy.cluster.hierarchy.linkage to get full method list.
-    :min_edges: None by default. If set to float, minimum number of edges that a community must contain to be kept in the clustering
-    :verbose: If True, write intermediary steps to disk.
+    :param weight: if None, the network is treated as unweighted and the Jaccard approximation is used. When set to a string, it names the edge attribute (usually 'weight') to be used as weight, and the Tanimoto approximation is used.
+    :param simthr: None by default. If set to a float, all similarity values below this threshold are set to 0 in the similarity matrix (this can reduce memory usage).
+    :param hcmethod: linkage method used in hierarchical clustering, 'single' by default. See scipy.cluster.hierarchy.linkage for the full method list.
+    :param min_edges: None by default. If set to an integer, the minimum number of edges that a community must contain to be kept in the clustering.
+    :param verbose: if True, write intermediary steps to disk.
 
     :return: EdgeClustering object
@@ -131,11 +152,20 @@ def hierarchical_link_community_w(g_original: object) -> EdgeClustering:
     Ahn, Yong-Yeol, James P. Bagrow, and Sune Lehmann. `Link communities reveal multiscale complexity in networks. `_ nature 466.7307 (2010): 761.
     """
-def hierarchical_link_community_full(g_original: object, weight='weight', simthr=None, hcmethod='single', min_edges= None, verbose=False) -> EdgeClustering:
+
     g = convert_graph_formats(g_original, nx.Graph)
     g_number, dictio = nx_node_integer_mapping(g)
-    coms = HLC_full(g_number, weight=weight, simthr=simthr, hcmethod=hcmethod, min_edges=min_edges, verbose=verbose, dictio= dictio).clusters
+    coms = HLC_full(
+        g_number,
+        weight=weight,
+        simthr=simthr,
+        hcmethod=hcmethod,
+        min_edges=min_edges,
+        verbose=verbose,
+        dictio=dictio,
+    ).clusters
     clustering = EdgeClustering(coms, g_number, "HLC_f", method_parameters={})
-    if dictio != None: clustering.communities = remap_edge_communities(clustering.communities, dictio)
-    return clustering
\ No newline at end of file
+    if dictio is not None:
+        clustering.communities = remap_edge_communities(clustering.communities, dictio)
+    return clustering
diff --git a/cdlib/algorithms/internal/HLC.py b/cdlib/algorithms/internal/HLC.py
index 8a1ed0f..5851b19 100644
--- a/cdlib/algorithms/internal/HLC.py
+++ b/cdlib/algorithms/internal/HLC.py
@@ -223,22 +223,61 @@ def cal_jaccard(intersect_val, left_val, right_val):
         heappush(min_heap, (1 - similarity_ratio, edge_pair))
     return [heappop(min_heap) for _ in range(len(min_heap))]
 
+
 class HLC_full(object):
-    def __init__(self, net, weight='weight', simthr=None, hcmethod=None, min_edges=None, verbose=False, dictio=None ):
+    def __init__(
+        self,
+        net,
+        weight="weight",
+        simthr=None,
+        hcmethod=None,
+        min_edges=None,
+        verbose=False,
+        dictio=None,
+    ):
         self.edge_counter = 0
-        self.clusters = self.edge_clustering(net, weight=weight, simthr=simthr, hcmethod=hcmethod, min_edges=min_edges, verbose=verbose, dictio = dictio)
+        self.clusters = self.edge_clustering(
+            net,
+            weight=weight,
+            simthr=simthr,
+            hcmethod=hcmethod,
+            min_edges=min_edges,
+            verbose=verbose,
+            dictio=dictio,
+        )
 
-    def edge_clustering(self, net, weight=None, simthr=None, hcmethod=None, min_edges=None, verbose=False, dictio=None):
-        condensed_dist_vector, edge_matrix_len, edge_list = self.get_edge_similarity_matrix(net, weight=weight, simthr=simthr, verbose=verbose, dictio=dictio) # TODO: Add jaccard matrix without use weigths
+    def edge_clustering(
+        self,
+        net,
+        weight=None,
+        simthr=None,
+        hcmethod=None,
+        min_edges=None,
+        verbose=False,
+        dictio=None,
+    ):
+        (
+            condensed_dist_vector,
+            edge_matrix_len,
+            edge_list,
+        ) = self.get_edge_similarity_matrix(
+            net, weight=weight, simthr=simthr, verbose=verbose, dictio=dictio
+        )
         clustering = linkage(condensed_dist_vector, hcmethod)
-        final_clusters = self.get_clusters_by_partition_density(clustering, edge_matrix_len, edge_list, min_edges=min_edges)
+        final_clusters = self.get_clusters_by_partition_density(
+            clustering, edge_matrix_len, edge_list, min_edges=min_edges
+        )
         return final_clusters
 
-    def get_edge_similarity_matrix(self, net, weight=None, simthr=None, verbose=False, dictio=None):
+    def get_edge_similarity_matrix(
+        self, net, weight=None, simthr=None, verbose=False, dictio=None
+    ):
         node_list = list(net.nodes())
         node_list.sort()
         adj = nx.adjacency_matrix(net, weight=weight, nodelist=node_list)
-        adj = adj.toarray() # This line is needed as a change in csr format from matrix to array sparse. Diference in dot product if this line is removed!
+        # Convert the sparse CSR matrix to a dense array; the dot product below
+        # differs if this conversion is removed!
+        adj = adj.toarray()
 
         if weight == None:
             degree = np.sum(adj, axis=1)
@@ -247,52 +286,78 @@ def get_edge_similarity_matrix(self, net, weight=None, simthr=None, verbose=Fals
             degree = adj > 0
             degree = np.sum(degree, axis=1)
             weigth_sum = np.sum(adj, axis=1)
-            np.fill_diagonal(adj, weigth_sum/degree) # Ahn ecuation 4 in supplementary file
-
-        dotproduct = np.dot(adj, adj) # This efficiently calculates the vector products needed for tanimoto coefficient (ai and aj)
-        adj = None # Remove matrix in order to save memory
+            # Ahn equation 4 in the supplementary file
+            np.fill_diagonal(adj, weigth_sum / degree)
+
+        # This efficiently calculates the vector products needed for the Tanimoto
+        # coefficient (a_i and a_j)
+        dotproduct = np.dot(adj, adj)
+        adj = None  # Drop the matrix to save memory
         edge_dict = {}
-        data = []; col_i = []; col_j = [] # To save tanimoto similarities as sparse data
-        cache = {} # To save tanimoto similarity by pairs and avoid caclulate it when a pair is repeated
+        # To store the Tanimoto similarities as sparse data
+        data = []
+        col_i = []
+        col_j = []
+        # Cache Tanimoto similarities by pair, to avoid recalculating them when
+        # a pair is repeated
+        cache = {}
         # k_node, i_neigh, j_neigh they are adj matrix indexes AND node names
-        edge_similarities = [] 
-        for k_node in node_list: # take node as reference and calculate tanimoto coeff for each pair of edges
-            neighbor_list = list(net.neighbors(k_node)) # take neighbors to get pairs and compare edges
+        edge_similarities = []
+        # Take each node as reference and calculate the Tanimoto coefficient for
+        # each pair of its edges
+        for k_node in node_list:
+            # Take its neighbours to build edge pairs and compare edges
+            neighbor_list = list(net.neighbors(k_node))
             neighbor_list.sort()
-            if len(neighbor_list) < 2: continue # Skip k nodes that has NOT enough neighbours to perform tanimoto
+            if len(neighbor_list) < 2:
+                continue  # Skip k nodes that do NOT have enough neighbours for Tanimoto
             while len(neighbor_list) > 0:
-                i_neigh = neighbor_list.pop() 
+                i_neigh = neighbor_list.pop()
                 for j_neigh in neighbor_list:
                     if weight != None:
-                        sim = self.get_tanimoto_index(i_neigh, j_neigh, dotproduct, cache)
+                        sim = self.get_tanimoto_index(
+                            i_neigh, j_neigh, dotproduct, cache
+                        )
                     else:
-                        sim = self.get_jaccard_index(i_neigh, j_neigh, dotproduct, degree, cache)
-                    if simthr != None and sim < simthr: continue
-                    pair = [self.get_edge_id(k_node, i_neigh, edge_dict), self.get_edge_id(k_node, j_neigh, edge_dict)]
+                        sim = self.get_jaccard_index(
+                            i_neigh, j_neigh, dotproduct, degree, cache
+                        )
+                    if simthr is not None and sim < simthr:
+                        continue
+                    pair = [
+                        self.get_edge_id(k_node, i_neigh, edge_dict),
+                        self.get_edge_id(k_node, j_neigh, edge_dict),
+                    ]
                     pair.sort()
                     ki_edge_id, kj_edge_id = pair
                     data.append(sim)
                     if verbose:
-                        a_pair = '_'.join(sorted([dictio[k_node], dictio[i_neigh]]))
-                        b_pair = '_'.join(sorted([dictio[k_node], dictio[j_neigh]]))
+                        a_pair = "_".join(sorted([dictio[k_node], dictio[i_neigh]]))
+                        b_pair = "_".join(sorted([dictio[k_node], dictio[j_neigh]]))
                         ids = sorted([a_pair, b_pair])
                         edge_similarities.append(ids + [str(sim)])
                     col_i.append(ki_edge_id)
                     col_j.append(kj_edge_id)
         if verbose:
-            with open('edge_scores.txt', 'w') as f:
-                for e in edge_similarities: f.write("\t".join(e) + "\n")
-        condensed_dist_vector, edge_matrix_len = self.get_distance_condensed_vector(data, col_i, col_j)
+            with open("edge_scores.txt", "w") as f:
+                for e in edge_similarities:
+                    f.write("\t".join(e) + "\n")
+        condensed_dist_vector, edge_matrix_len = self.get_distance_condensed_vector(
+            data, col_i, col_j
+        )
         return condensed_dist_vector, edge_matrix_len, list(edge_dict.keys())
 
     def get_tanimoto_index(self, i_neigh, j_neigh, dotproduct, cache):
         sim, pair = self.get_sim(i_neigh, j_neigh, cache)
         if sim == None:
-            a_i2 = dotproduct[i_neigh,i_neigh]
-            a_j2 = dotproduct[j_neigh,j_neigh]
+            a_i2 = dotproduct[i_neigh, i_neigh]
+            a_j2 = dotproduct[j_neigh, j_neigh]
             a_ij = dotproduct[i_neigh, j_neigh]
-            sim = a_ij /( a_i2 + a_j2 - a_ij)
+            sim = a_ij / (a_i2 + a_j2 - a_ij)
             cache[pair] = sim
         return sim
@@ -300,7 +365,7 @@ def get_jaccard_index(self, i_neigh, j_neigh, dotproduct, degree, cache):
         sim, pair = self.get_sim(i_neigh, j_neigh, cache)
         if sim == None:
             a_ij = dotproduct[i_neigh, j_neigh]
-            sim = a_ij /min(degree[i_neigh], degree[j_neigh])
+            sim = a_ij / min(degree[i_neigh], degree[j_neigh])
             cache[pair] = sim
         return sim
@@ -323,51 +388,75 @@ def get_edge_id(self, a, b, e_dict):
         return e_index
 
     def get_distance_condensed_vector(self, data, col_i, col_j):
-        edge_matrix_len = max([max(col_i), max(col_j)]) + 1 # Values in col_i and col_j are 0 based indexes so we need to add 1 to get the vector size
-        upper_triangle_size = (edge_matrix_len**2 - edge_matrix_len)//2
-        condensed_vector = np.ones(upper_triangle_size)
-        for idx, sim in enumerate(data): # m * i + j - ((i + 2) * (i + 1)) / 2 from https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html
+        # Values in col_i and col_j are 0-based indexes, so we add 1 to get the vector size
+        edge_matrix_len = max([max(col_i), max(col_j)]) + 1
+        upper_triangle_size = (edge_matrix_len**2 - edge_matrix_len) // 2
+        condensed_vector = np.ones(upper_triangle_size)
+        # Condensed index: m * i + j - ((i + 2) * (i + 1)) / 2, from
+        # https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html
+        for idx, sim in enumerate(data):
             i = col_i[idx]
             j = col_j[idx]
             v = edge_matrix_len * i + j - ((i + 2) * (i + 1)) // 2
             condensed_vector[v] = 1 - sim
-        return condensed_vector, edge_matrix_len 
+        return condensed_vector, edge_matrix_len
 
-    def get_clusters_by_partition_density(self, clustering, edge_len, edge_list, min_edges=None):
-        tree = {} # clust id : [member_ids]
-        edges_per_cluster = {} #clust_id : [ edge_tuples ]
-        partial_partition_densities = {} #clust_id : [ cluster_partition_density ]
+    def get_clusters_by_partition_density(
+        self, clustering, edge_len, edge_list, min_edges=None
+    ):
+        tree = {}  # clust_id : [member_ids]
+        edges_per_cluster = {}  # clust_id : [edge_tuples]
+        partial_partition_densities = {}  # clust_id : cluster_partition_density
 
-        counter = edge_len # this works as cluster id. This is used by the linkage method to tag the intermediate clusters: https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html
+        # counter works as a cluster id; the linkage method uses such ids to tag the
+        # intermediate clusters: https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html
+        counter = edge_len
         last_dist = None
-        constant = 2/edge_len
+        constant = 2 / edge_len
         last_cluster_pool = []
         max_pden = -10000000000
         max_cluster_ids = []
         for a_id, b_id, dist, n_members in clustering:
-            dist = round(dist, 5) # To make equal similar distances that differs in very low values
-            if last_dist != None and dist != last_dist: # We could have several clusters at the same dist, so we group the merge events to clculate the partition density
-                p_den = self.get_pden(last_cluster_pool, partial_partition_densities, constant)
-                if p_den > max_pden: # check the best partition density
+            # Round so that similar distances that differ by very small values
+            # compare as equal
+            dist = round(dist, 5)
+            # We can have several merges at the same dist, so we group those merge
+            # events before calculating the partition density
+            if last_dist is not None and dist != last_dist:
+                p_den = self.get_pden(
+                    last_cluster_pool, partial_partition_densities, constant
+                )
+                if p_den > max_pden:  # keep track of the best partition density
                     max_pden = p_den
                     max_cluster_ids = last_cluster_pool
-            a_id = int(a_id) # Linkage method returns member ids as float instead of int
+            a_id = int(a_id)  # linkage returns member ids as float instead of int
             b_id = int(b_id)
-            member_list = self.get_member_list(counter, a_id, b_id, edge_len, tree) # members that we merge to build the new agglomerative cluster
+            # Members that we merge to build the new agglomerative cluster
+            member_list = self.get_member_list(counter, a_id, b_id, edge_len, tree)
             nodes, edges = self.get_nodesNedges_per_cluster(member_list, edge_list)
             edges_per_cluster[counter] = edges
-            partial_partition_densities[counter] = self.get_cluster_partial_partition_density(member_list, nodes)
-            last_cluster_pool = [ cl_id for cl_id in last_cluster_pool if cl_id not in [a_id, b_id] ] # update clusters removin merged cl ids and adding the new cluters ids
+            partial_partition_densities[
+                counter
+            ] = self.get_cluster_partial_partition_density(member_list, nodes)
+            # Update the cluster pool, removing the merged cluster ids and adding
+            # the new cluster id
+            last_cluster_pool = [
+                cl_id for cl_id in last_cluster_pool if cl_id not in [a_id, b_id]
+            ]
             last_cluster_pool.append(counter)
             last_dist = dist
             counter += 1
 
-        p_den = self.get_pden(last_cluster_pool, partial_partition_densities, constant) # update clusters removin merged cl ids and adding the new cluters ids
-        if p_den > max_pden: # check the best partition density on the last distance that not was checked
+        # Check the partition density at the last distance, which the loop above
+        # never evaluates
+        p_den = self.get_pden(last_cluster_pool, partial_partition_densities, constant)
+        if p_den > max_pden:
             max_pden = p_den
             max_cluster_ids = last_cluster_pool
-        final_clusters = [ ]
+        final_clusters = []
         for cluster_id in max_cluster_ids:
             members = edges_per_cluster[cluster_id]
             if min_edges == None or len(members) >= min_edges:
@@ -375,15 +464,21 @@ def get_clusters_by_partition_density(self, clustering, edge_len, edge_list, min
         return final_clusters
 
     def add_cluster_members(self, cluster, member_id, n_records, tree):
-        if member_id < n_records: # check if member_id is a cluster with only one member that is a original record. That id is less than n_records is the criteria described in https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html
+        # member_id refers to an original observation (a singleton cluster) when it
+        # is lower than n_records, following the criterion described in
+        # https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html
+        if member_id < n_records:
             cluster.append(member_id)
-        else: # The id represents the merge of two previous clusters. We obtain the member list from the tree and remove it to merge in the it in the new cluster
+        else:
+            # The id represents the merge of two previous clusters: we obtain the
+            # member list from the tree and remove it, merging it into the new cluster
             cluster.extend(tree.pop(member_id))
 
     def get_member_list(self, cluster_id, a_id, b_id, edge_len, tree):
         member_list = []
-        self.add_cluster_members(member_list, a_id, edge_len, tree) # get cluster members from previous a cluster
-        self.add_cluster_members(member_list, b_id, edge_len, tree) # get cluster members from previous b cluster
+        # Get the cluster members of the previous a and b clusters
+        self.add_cluster_members(member_list, a_id, edge_len, tree)
+        self.add_cluster_members(member_list, b_id, edge_len, tree)
         tree[cluster_id] = member_list
         return member_list
@@ -393,16 +488,18 @@ def get_nodesNedges_per_cluster(self, members, edge_list):
         for member in members:
             edge = edge_list[member]
             edges.append(edge)
-            nodes.extend(edge) # Add edge nodes to node list
+            nodes.extend(edge)  # Add edge nodes to the node list
         return list(set(nodes)), edges
 
     def get_cluster_partial_partition_density(self, edges, nodes):
-        n = len(nodes) #node number
-        m = len(edges) # link number
-        #return (m-(n-1))/(n*(n-1)/(2-(n-1))) #Ahn
-        return (m*(m-n+1))/((n-2)*(n-1)) #kalinka
+        n = len(nodes)  # node number
+        m = len(edges)  # link number
+        # return (m - (n - 1)) / (n * (n - 1) / (2 - (n - 1)))  # Ahn
+        return (m * (m - n + 1)) / ((n - 2) * (n - 1))  # kalinka
 
     def get_pden(self, last_cluster_pool, partial_partition_densities, constant):
-        partition_den_sum = sum([ partial_partition_densities[cl_id] for cl_id in last_cluster_pool]) #Partition density
+        partition_den_sum = sum(
+            [partial_partition_densities[cl_id] for cl_id in last_cluster_pool]
+        )  # Partition density
         p_den = constant * partition_den_sum
-        return p_den
\ No newline at end of file
+        return p_den
diff --git a/cdlib/utils.py b/cdlib/utils.py
index 528f590..17a6e03 100644
--- a/cdlib/utils.py
+++ b/cdlib/utils.py
@@ -239,7 +239,10 @@ def remap_node_communities(communities: object, node_map: dict) -> list:
     communities = cms
     return communities
 
-def remap_edge_communities(communities: object, node_map: dict) -> list: # ADDED TO HANDLE THIS CASE, VERSION FOR NODES CAN'T HANDLE THIS
+
+def remap_edge_communities(
+    communities: object, node_map: dict
+) -> list:  # Added for edge communities; the node version can't handle this case
     """Apply a map to the obtained communities to retreive the original node labels
 
     :param communities: EdgeClustering object
@@ -254,6 +257,7 @@ def remap_edge_communities(communities: object, node_map: dict) -> list: # ADDED
     communities = cms
     return communities
 
+
 def affiliations2nodesets(affiliations: dict) -> dict:
     """
     Transform community format to nodesets
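
# ---------------------------------------------------------------------------
# Reviewer notes (not part of the patch)
# ---------------------------------------------------------------------------

A minimal usage sketch for the new public entry point. It assumes this patch is
applied and that cdlib.algorithms re-exports the names listed in __all__, as it
does for the other edge-clustering functions; the example graph and parameter
values are illustrative only.

    import networkx as nx
    from cdlib import algorithms

    g = nx.karate_club_graph()

    # Unweighted variant: weight=None selects the Jaccard approximation.
    coms = algorithms.hierarchical_link_community_full(g, weight=None)
    print(coms.method_name)       # "HLC_f"
    print(len(coms.communities))  # number of link communities at the best cut

    # Weighted variant: use the 'weight' edge attribute (Tanimoto approximation),
    # discard similarities below 0.1, and keep only communities with >= 3 edges.
    coms = algorithms.hierarchical_link_community_full(
        g, weight="weight", simthr=0.1, min_edges=3
    )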
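
The condensed index used in get_distance_condensed_vector can be sanity-checked
against the pair ordering documented for scipy.spatial.distance.pdist (the URL
cited in the code). A small self-contained check, with m standing for
edge_matrix_len:

    def condensed_index(i, j, m):
        # Same formula as in the patch, valid for i < j
        return m * i + j - ((i + 2) * (i + 1)) // 2

    m = 4
    # pdist enumerates the upper triangle row by row:
    # (0,1), (0,2), (0,3), (1,2), (1,3), (2,3)
    pairs = [(i, j) for i in range(m) for j in range(i + 1, m)]
    for expected, (i, j) in enumerate(pairs):
        assert condensed_index(i, j, m) == expected

For example, (i=1, j=3) maps to 4*1 + 3 - (3 * 2) // 2 = 4, the fifth entry of
the condensed vector, matching the ordering above.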
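
On the "kalinka" expression kept in get_cluster_partial_partition_density:
combined with constant = 2 / edge_len in get_pden, the summed quantity appears
to reproduce the partition density of Ahn et al. (2010), using the identities
m_c - (n_c - 1) = m_c - n_c + 1 and n_c(n_c - 1)/2 - (n_c - 1) = (n_c - 1)(n_c - 2)/2:

    D = \frac{2}{M} \sum_{c} \frac{m_c \, (m_c - n_c + 1)}{(n_c - 2)(n_c - 1)}

where M is the total number of edges (edge_len) and m_c, n_c are the edge and
node counts of link community c. The per-cluster term would be 0/0 for
n_c = 2, but that case should not arise here for simple graphs: partial
densities are only computed for merged clusters of at least two distinct
edges, which span at least three nodes.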