From 40a8893908e1860ba35ba2dae2ab82c6ac17d0ea Mon Sep 17 00:00:00 2001
From: seoanezonjic
Date: Mon, 27 May 2024 15:14:58 +0200
Subject: [PATCH 1/3] Added HLC_w to activate weighted networks, and HLC_f, a
 full implementation of the Ahn algorithm using numpy and scipy.

---
 cdlib/algorithms/edge_clustering.py |  60 ++++++++-
 cdlib/algorithms/internal/HLC.py    | 188 +++++++++++++++++++++++++++-
 cdlib/utils.py                      |  14 +++
 3 files changed, 258 insertions(+), 4 deletions(-)

diff --git a/cdlib/algorithms/edge_clustering.py b/cdlib/algorithms/edge_clustering.py
index 54e55e91..93a005c3 100644
--- a/cdlib/algorithms/edge_clustering.py
+++ b/cdlib/algorithms/edge_clustering.py
@@ -1,10 +1,10 @@
 from cdlib import EdgeClustering
 from collections import defaultdict
 import networkx as nx
-from cdlib.utils import convert_graph_formats
-from cdlib.algorithms.internal.HLC import HLC, HLC_read_edge_list_unweighted
+from cdlib.utils import convert_graph_formats, nx_node_integer_mapping, remap_edge_communities
+from cdlib.algorithms.internal.HLC import HLC, HLC_read_edge_list_unweighted, HLC_read_edge_list_weighted, HLC_full
 
-__all__ = ["hierarchical_link_community"]
+__all__ = ["hierarchical_link_community", "hierarchical_link_community_w", "hierarchical_link_community_full"]
 
 
 def hierarchical_link_community(g_original: object) -> EdgeClustering:
@@ -50,3 +50,57 @@ def hierarchical_link_community(g_original: object) -> EdgeClustering:
     coms = [list(c) for c in coms.values()]
 
     return EdgeClustering(coms, g_original, "HLC", method_parameters={})
+
+
+def hierarchical_link_community_w(g_original: object) -> EdgeClustering:
+    """
+    HLC (hierarchical link clustering) is a method to classify links into topologically related groups.
+    The algorithm uses a similarity between links to build a dendrogram in which each leaf is a link from the original network and branches represent link communities.
+    The partition density function, based on the link density inside communities, is computed at each level of the link dendrogram to pick the best level to cut.
+
+    **Supported Graph Types**
+
+    ========== ======== ========
+    Undirected Directed Weighted
+    ========== ======== ========
+    Yes        No       Yes
+    ========== ======== ========
+
+    :param g_original: a networkx/igraph object
+    :return: EdgeClustering object
+
+    :Example:
+
+    >>> from cdlib import algorithms
+    >>> import networkx as nx
+    >>> G = nx.karate_club_graph()
+    >>> com = algorithms.hierarchical_link_community_w(G)
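+    >>> # a minimal sketch of weighted usage (the uniform weights are illustrative):
+    >>> nx.set_edge_attributes(G, 1.0, "weight")
+    >>> com_w = algorithms.hierarchical_link_community_w(G)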
+
+    :References:
+
+    Ahn, Yong-Yeol, James P. Bagrow, and Sune Lehmann. `Link communities reveal multiscale complexity in networks. <https://www.nature.com/articles/nature09182>`_ Nature 466.7307 (2010): 761.
+    """
+
+    g = convert_graph_formats(g_original, nx.Graph)
+
+    adj, edges, ij2wij = HLC_read_edge_list_weighted(g)
+    edge2cid, _, _, _ = HLC(adj, edges).single_linkage(w=ij2wij)
+
+    coms = defaultdict(list)
+    for e, com in edge2cid.items():
+        coms[com].append(e)
+
+    coms = [list(c) for c in coms.values()]
+    return EdgeClustering(coms, g_original, "HLC_w", method_parameters={})
+
+
+def hierarchical_link_community_full(g_original: object, weight='weight', simthr=None, hcmethod='single', min_edges=None, verbose=False) -> EdgeClustering:
+    g = convert_graph_formats(g_original, nx.Graph)
+    g_number, dictio = nx_node_integer_mapping(g)
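+    # nx_node_integer_mapping relabels nodes as consecutive integers; dictio maps the
+    # integers back to the original labels (None if no relabelling was needed). HLC_full
+    # runs on the integer-labelled copy and the communities are remapped below.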
+
+    coms = HLC_full(g_number, weight=weight, simthr=simthr, hcmethod=hcmethod, min_edges=min_edges, verbose=verbose, dictio=dictio).clusters
+    clustering = EdgeClustering(coms, g_number, "HLC_f", method_parameters={})
+    if dictio is not None:
+        clustering.communities = remap_edge_communities(clustering.communities, dictio)
+    return clustering
\ No newline at end of file
diff --git a/cdlib/algorithms/internal/HLC.py b/cdlib/algorithms/internal/HLC.py
index e20daf11..8a1ed0fc 100644
--- a/cdlib/algorithms/internal/HLC.py
+++ b/cdlib/algorithms/internal/HLC.py
@@ -2,7 +2,9 @@
 from heapq import heappush, heappop
 from itertools import combinations, chain
 from collections import defaultdict
-
+import networkx as nx
+import numpy as np
+from scipy.cluster.hierarchy import linkage
 
 # Hierarchical Link Community
 """
@@ -220,3 +222,187 @@ def cal_jaccard(intersect_val, left_val, right_val):
         similarity_ratio = cal_jaccard(ai_dot_aj, n2a_sqrd[i], n2a_sqrd[j])
         heappush(min_heap, (1 - similarity_ratio, edge_pair))
     return [heappop(min_heap) for _ in range(len(min_heap))]
+
+
+class HLC_full(object):
+    def __init__(self, net, weight='weight', simthr=None, hcmethod=None, min_edges=None, verbose=False, dictio=None):
+        self.edge_counter = 0
+        self.clusters = self.edge_clustering(net, weight=weight, simthr=simthr, hcmethod=hcmethod, min_edges=min_edges, verbose=verbose, dictio=dictio)
+
+    def edge_clustering(self, net, weight=None, simthr=None, hcmethod=None, min_edges=None, verbose=False, dictio=None):
+        condensed_dist_vector, edge_matrix_len, edge_list = self.get_edge_similarity_matrix(net, weight=weight, simthr=simthr, verbose=verbose, dictio=dictio)  # TODO: add a Jaccard matrix path that does not use weights
+        clustering = linkage(condensed_dist_vector, hcmethod)
+        final_clusters = self.get_clusters_by_partition_density(clustering, edge_matrix_len, edge_list, min_edges=min_edges)
+        return final_clusters
+
+    def get_edge_similarity_matrix(self, net, weight=None, simthr=None, verbose=False, dictio=None):
+        node_list = list(net.nodes())
+        node_list.sort()
+        adj = nx.adjacency_matrix(net, weight=weight, nodelist=node_list)
+        adj = adj.toarray()  # needed since scipy changed the csr format from matrix to sparse array; the dot product differs if this line is removed!
+
+        if weight is None:
+            degree = np.sum(adj, axis=1)
+            np.fill_diagonal(adj, 1)
+        else:
+            degree = adj > 0
+            degree = np.sum(degree, axis=1)
+            weight_sum = np.sum(adj, axis=1)
+            np.fill_diagonal(adj, weight_sum/degree)  # Ahn, equation 4 in the supplementary material
+
+        dotproduct = np.dot(adj, adj)  # efficiently calculates the vector products needed for the Tanimoto coefficient (a_i and a_j)
+        adj = None  # drop the matrix to save memory
+        edge_dict = {}
+        data = []; col_i = []; col_j = []  # to save Tanimoto similarities as sparse data
+        cache = {}  # to cache Tanimoto similarities by pair and avoid recalculating them when a pair is repeated
+        # k_node, i_neigh and j_neigh are adj matrix indexes AND node names
+        edge_similarities = []
+        for k_node in node_list:  # take each node as reference and calculate the Tanimoto coefficient for each pair of its edges
+            neighbor_list = list(net.neighbors(k_node))  # take neighbors to get edge pairs to compare
+            neighbor_list.sort()
+            if len(neighbor_list) < 2: continue  # skip reference nodes that do not have enough neighbours to build an edge pair
+            while len(neighbor_list) > 0:
+                i_neigh = neighbor_list.pop()
+                for j_neigh in neighbor_list:
+                    if weight is not None:
+                        sim = self.get_tanimoto_index(i_neigh, j_neigh, dotproduct, cache)
+                    else:
+                        sim = self.get_jaccard_index(i_neigh, j_neigh, dotproduct, degree, cache)
+                    if simthr is not None and sim < simthr: continue
+                    pair = [self.get_edge_id(k_node, i_neigh, edge_dict), self.get_edge_id(k_node, j_neigh, edge_dict)]
+                    pair.sort()
+                    ki_edge_id, kj_edge_id = pair
+                    data.append(sim)
+                    if verbose:
+                        a_pair = '_'.join(sorted([dictio[k_node], dictio[i_neigh]]))
+                        b_pair = '_'.join(sorted([dictio[k_node], dictio[j_neigh]]))
+                        ids = sorted([a_pair, b_pair])
+                        edge_similarities.append(ids + [str(sim)])
+                    col_i.append(ki_edge_id)
+                    col_j.append(kj_edge_id)
+
+        if verbose:
+            with open('edge_scores.txt', 'w') as f:
+                for e in edge_similarities: f.write("\t".join(e) + "\n")
+        condensed_dist_vector, edge_matrix_len = self.get_distance_condensed_vector(data, col_i, col_j)
+        return condensed_dist_vector, edge_matrix_len, list(edge_dict.keys())
+
+    def get_tanimoto_index(self, i_neigh, j_neigh, dotproduct, cache):
+        sim, pair = self.get_sim(i_neigh, j_neigh, cache)
+        if sim is None:
+            a_i2 = dotproduct[i_neigh, i_neigh]
+            a_j2 = dotproduct[j_neigh, j_neigh]
+            a_ij = dotproduct[i_neigh, j_neigh]
+            sim = a_ij / (a_i2 + a_j2 - a_ij)
+            cache[pair] = sim
+        return sim
+
+    def get_jaccard_index(self, i_neigh, j_neigh, dotproduct, degree, cache):
+        sim, pair = self.get_sim(i_neigh, j_neigh, cache)
+        if sim is None:
+            a_ij = dotproduct[i_neigh, j_neigh]
+            sim = a_ij / min(degree[i_neigh], degree[j_neigh])
+            cache[pair] = sim
+        return sim
+
+    def get_sim(self, i_neigh, j_neigh, cache):
+        pair = [i_neigh, j_neigh]
+        pair.sort()
+        pair = tuple(pair)
+        sim = cache.get(pair)
+        return sim, pair
+
+    def get_edge_id(self, a, b, e_dict):
+        e = [a, b]
+        e.sort()
+        edge_id = tuple(e)
+        e_index = e_dict.get(edge_id)
+        if e_index is None:
+            e_index = self.edge_counter
+            e_dict[edge_id] = e_index
+            self.edge_counter += 1
+        return e_index
+
+    def get_distance_condensed_vector(self, data, col_i, col_j):
+        edge_matrix_len = max([max(col_i), max(col_j)]) + 1  # values in col_i and col_j are 0-based indexes, so add 1 to get the matrix size
+        upper_triangle_size = (edge_matrix_len**2 - edge_matrix_len)//2
+        condensed_vector = np.ones(upper_triangle_size)
+        for idx, sim in enumerate(data):  # v = m * i + j - ((i + 2) * (i + 1)) / 2, from https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html
+            i = col_i[idx]
+            j = col_j[idx]
+            v = edge_matrix_len * i + j - ((i + 2) * (i + 1)) // 2
+            condensed_vector[v] = 1 - sim
+        return condensed_vector, edge_matrix_len
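+        # Worked example of the condensed index above (hypothetical sizes): with
+        # edge_matrix_len = 4, the pair (i, j) = (1, 3) maps to
+        # v = 4*1 + 3 - ((1+2)*(1+1))//2 = 4, i.e. entry 4 of the length-6 condensed
+        # vector, matching scipy's pdist ordering (0,1), (0,2), (0,3), (1,2), (1,3), (2,3).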
+
+    def get_clusters_by_partition_density(self, clustering, edge_len, edge_list, min_edges=None):
+        tree = {}  # clust_id : [member_ids]
+        edges_per_cluster = {}  # clust_id : [edge_tuples]
+        partial_partition_densities = {}  # clust_id : partial partition density of the cluster
+
+        counter = edge_len  # works as the cluster id; the linkage method uses ids from edge_len upwards to tag intermediate clusters: https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html
+        last_dist = None
+        constant = 2/edge_len
+        last_cluster_pool = []
+        max_pden = -float("inf")
+        max_cluster_ids = []
+        for a_id, b_id, dist, n_members in clustering:
+            dist = round(dist, 5)  # round so that distances differing only by very small values are treated as equal
+            if last_dist is not None and dist != last_dist:  # several clusters can merge at the same distance, so merge events are grouped before calculating the partition density
+                p_den = self.get_pden(last_cluster_pool, partial_partition_densities, constant)
+                if p_den > max_pden:  # keep the best partition density
+                    max_pden = p_den
+                    max_cluster_ids = last_cluster_pool
+            a_id = int(a_id)  # the linkage method returns member ids as float instead of int
+            b_id = int(b_id)
+            member_list = self.get_member_list(counter, a_id, b_id, edge_len, tree)  # members that are merged to build the new agglomerative cluster
+            nodes, edges = self.get_nodesNedges_per_cluster(member_list, edge_list)
+            edges_per_cluster[counter] = edges
+            partial_partition_densities[counter] = self.get_cluster_partial_partition_density(member_list, nodes)
+            last_cluster_pool = [cl_id for cl_id in last_cluster_pool if cl_id not in [a_id, b_id]]  # update the pool, removing the merged cluster ids and adding the new cluster id
+            last_cluster_pool.append(counter)
+            last_dist = dist
+            counter += 1
+
+        p_den = self.get_pden(last_cluster_pool, partial_partition_densities, constant)
+        if p_den > max_pden:  # check the partition density at the last distance, which was not checked inside the loop
+            max_pden = p_den
+            max_cluster_ids = last_cluster_pool
+
+        final_clusters = []
+        for cluster_id in max_cluster_ids:
+            members = edges_per_cluster[cluster_id]
+            if min_edges is None or len(members) >= min_edges:
+                final_clusters.append(members)
+        return final_clusters
+
+    def add_cluster_members(self, cluster, member_id, n_records, tree):
+        if member_id < n_records:  # an id below n_records denotes a singleton cluster holding an original record, as described in https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html
+            cluster.append(member_id)
+        else:  # the id represents the merge of two previous clusters; take its member list from the tree and remove it, since it is absorbed into the new cluster
+            cluster.extend(tree.pop(member_id))
+
+    def get_member_list(self, cluster_id, a_id, b_id, edge_len, tree):
+        member_list = []
+        self.add_cluster_members(member_list, a_id, edge_len, tree)  # get cluster members from previous cluster a
+        self.add_cluster_members(member_list, b_id, edge_len, tree)  # get cluster members from previous cluster b
+        tree[cluster_id] = member_list
+        return member_list
+
+    def get_nodesNedges_per_cluster(self, members, edge_list):
+        nodes = []
+        edges = []
+        for member in members:
+            edge = edge_list[member]
+            edges.append(edge)
+            nodes.extend(edge)  # add the edge nodes to the node list
+        return list(set(nodes)), edges
+
+    def get_cluster_partial_partition_density(self, edges, nodes):
+        n = len(nodes)  # node number
+        m = len(edges)  # link number
+        # return (m - (n - 1)) / (n * (n - 1) / 2 - (n - 1))  # Ahn, per-cluster link density D_c
+        return (m*(m-n+1))/((n-2)*(n-1))  # kalinka
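+        # Consistency note (assuming the notation of Ahn et al., with m_c links and n_c
+        # nodes in cluster c): the value returned above equals m_c * D_c / 2, so the
+        # factor 2/M applied in get_pden reproduces the paper's partition density
+        # D = (2/M) * sum_c m_c * (m_c - n_c + 1) / ((n_c - 2) * (n_c - 1)).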
+
+    def get_pden(self, last_cluster_pool, partial_partition_densities, constant):
+        partition_den_sum = sum([partial_partition_densities[cl_id] for cl_id in last_cluster_pool])  # partition density
+        p_den = constant * partition_den_sum
+        return p_den
\ No newline at end of file
diff --git a/cdlib/utils.py b/cdlib/utils.py
index ae9b74a9..528f5909 100644
--- a/cdlib/utils.py
+++ b/cdlib/utils.py
@@ -239,6 +239,20 @@ def remap_node_communities(communities: object, node_map: dict) -> list:
     communities = cms
     return communities
 
+def remap_edge_communities(communities: object, node_map: dict) -> list:  # edge variant: remap_node_communities cannot handle edge tuples
+    """Apply a map to the obtained communities to retrieve the original node labels
+
+    :param communities: EdgeClustering object
+    :param node_map: dictionary
+    :return: remapped communities
+    """
+
+    cms = []
+    for community in communities:
+        community = [(node_map[e[0]], node_map[e[1]]) for e in community]
+        cms.append(community)
+    communities = cms
+    return communities
 
 def affiliations2nodesets(affiliations: dict) -> dict:
     """

From e7cd48b99ef76cabe3ddaeb2c8fab01097d374eb Mon Sep 17 00:00:00 2001
From: seoanezonjic
Date: Mon, 27 May 2024 16:06:14 +0200
Subject: [PATCH 2/3] added test to edgeclustering

---
 cdlib/test/test_edgeclustering.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/cdlib/test/test_edgeclustering.py b/cdlib/test/test_edgeclustering.py
index 8a591cd2..d78c07d5 100644
--- a/cdlib/test/test_edgeclustering.py
+++ b/cdlib/test/test_edgeclustering.py
@@ -12,6 +12,16 @@ def test_to_json(self):
         js = coms.to_json()
         self.assertIsInstance(js, str)
 
+        coms = algorithms.hierarchical_link_community_w(g)
+        self.assertIsInstance(coms, EdgeClustering)
+        js = coms.to_json()
+        self.assertIsInstance(js, str)
+
+        coms = algorithms.hierarchical_link_community_full(g)
+        self.assertIsInstance(coms, EdgeClustering)
+        js = coms.to_json()
+        self.assertIsInstance(js, str)
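+
+        # a minimal extra check exercising the new parameters (illustrative values):
+        coms = algorithms.hierarchical_link_community_full(g, weight=None, min_edges=2)
+        self.assertIsInstance(coms, EdgeClustering)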
+
     def test_node_map(self):
         g = nx.karate_club_graph()
         coms = algorithms.hierarchical_link_community(g)

From aac30434fb3d6133c4bd1b5d9a936fca1a7dc726 Mon Sep 17 00:00:00 2001
From: seoanezonjic
Date: Mon, 27 May 2024 18:52:11 +0200
Subject: [PATCH 3/3] Add docstring to hierarchical_link_community_full

---
 cdlib/algorithms/edge_clustering.py | 35 +++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/cdlib/algorithms/edge_clustering.py b/cdlib/algorithms/edge_clustering.py
index 93a005c3..ec591c24 100644
--- a/cdlib/algorithms/edge_clustering.py
+++ b/cdlib/algorithms/edge_clustering.py
@@ -96,6 +96,41 @@ def hierarchical_link_community_w(g_original: object) -> EdgeClustering:
     return EdgeClustering(coms, g_original, "HLC_w", method_parameters={})
 
 
 def hierarchical_link_community_full(g_original: object, weight='weight', simthr=None, hcmethod='single', min_edges=None, verbose=False) -> EdgeClustering:
+    """
+    HLC (hierarchical link clustering) is a method to classify links into topologically related groups.
+    The algorithm uses a similarity between links to build a dendrogram in which each leaf is a link from the original network and branches represent link communities.
+    The partition density function, based on the link density inside communities, is computed at each level of the link dendrogram to pick the best level to cut.
+    This implementation follows the algorithm described in Ahn et al. exactly and uses numpy/scipy to improve the clustering computation (it is faster and consumes less memory).
+
+    **Supported Graph Types**
+
+    ========== ======== ========
+    Undirected Directed Weighted
+    ========== ======== ========
+    Yes        No       Yes
+    ========== ======== ========
+
+    :param g_original: a networkx/igraph object
+    :param weight: None for unweighted networks; a Jaccard approximation is then used. When set to a string, the edge attribute with that name (usually 'weight') is used as weight and the Tanimoto approximation is used.
+    :param simthr: None by default. If set to a float, all similarity values below the threshold are set to 0 in the similarity matrix (this can reduce memory usage).
+    :param hcmethod: linkage method used in hierarchical clustering, 'single' by default. See scipy.cluster.hierarchy.linkage for the full method list.
+    :param min_edges: None by default. If set to an integer, the minimum number of edges that a community must contain to be kept in the clustering.
+    :param verbose: if True, write intermediary steps to disk.
+    :return: EdgeClustering object
+
+    :Example:
+
+    >>> from cdlib import algorithms
+    >>> import networkx as nx
+    >>> G = nx.karate_club_graph()
+    >>> com = algorithms.hierarchical_link_community_full(G)
+
+    :References:
+
+    Ahn, Yong-Yeol, James P. Bagrow, and Sune Lehmann. `Link communities reveal multiscale complexity in networks. <https://www.nature.com/articles/nature09182>`_ Nature 466.7307 (2010): 761.
+    """
     g = convert_graph_formats(g_original, nx.Graph)
     g_number, dictio = nx_node_integer_mapping(g)