author     guy rozenberg <guyroznb@gmail.com>  2021-01-19 22:04:34 +0200
committer  GitHub <noreply@github.com>         2021-01-19 15:04:34 -0500
commit     31c70c24d5ab657b02d453fbe3196071d15cd099 (patch)
tree       3343d0f8beefaadc2f12a5cbde33bcd5994c65ce
parent     e5c4b3e382c63ed007b82fb48b3b0d34a8e16c54 (diff)
download   networkx-31c70c24d5ab657b02d453fbe3196071d15cd099.tar.gz
improve group betweenness centrality (#4435)
* In this PR I improved the group betweenness centrality algorithm by using the algorithm of Puzis et al., "Fast algorithm for successive computation of group betweenness centrality". In the previous implementation, for every source-target pair of non-group nodes, all the shortest paths were found, and if one of them went through a node in the group, the group betweenness centrality was increased; this was repeated for every group. Here I used the Puzis et al. algorithm, which computes the GBC in $O(k^3)$ time per group ($k$ is the number of nodes in the group) after a single $O(mn)$ preprocessing step shared by all the groups. This enables us to find the GBC of many groups with one preprocessing step. Furthermore, the article shows how to find the prominent group of size $k$ (the group with the highest GBC among all groups of size $k$), which I plan to add in a future PR.
* backwards-compatible with the previous implementation
* After review
* improving the runtime of the algorithm and fixing a bug
* changing the helper functions
* change the helper function; the output value for a single group is now a single float
* change the function description
* after review: change normalize and docstring
* divide by 2 if G is undirected
* after changes
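A minimal usage sketch of the interface this patch introduces, with expected values taken from the updated tests below (illustrative only, not part of the patch):

    import networkx as nx

    G = nx.path_graph(5)
    # A single group yields a single float.
    print(nx.group_betweenness_centrality(G, [1], normalized=False))  # 3.0

    # A list of groups shares one preprocessing pass and yields a list of floats.
    H = nx.cycle_graph(7)
    print(nx.group_betweenness_centrality(H, [[0, 1, 6], [0, 1, 5]], normalized=False))  # [0.0, 3.0]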
-rw-r--r--  networkx/algorithms/centrality/betweenness.py          20
-rw-r--r--  networkx/algorithms/centrality/betweenness_subset.py    8
-rw-r--r--  networkx/algorithms/centrality/group.py                195
-rw-r--r--  networkx/algorithms/centrality/percolation.py            4
-rw-r--r--  networkx/algorithms/centrality/tests/test_group.py      53
5 files changed, 211 insertions, 69 deletions
diff --git a/networkx/algorithms/centrality/betweenness.py b/networkx/algorithms/centrality/betweenness.py
index 6907f25b..87a843f0 100644
--- a/networkx/algorithms/centrality/betweenness.py
+++ b/networkx/algorithms/centrality/betweenness.py
@@ -124,14 +124,14 @@ def betweenness_centrality(
for s in nodes:
# single source shortest paths
if weight is None: # use BFS
- S, P, sigma = _single_source_shortest_path_basic(G, s)
+ S, P, sigma, _ = _single_source_shortest_path_basic(G, s)
else: # use Dijkstra's algorithm
- S, P, sigma = _single_source_dijkstra_path_basic(G, s, weight)
+ S, P, sigma, _ = _single_source_dijkstra_path_basic(G, s, weight)
# accumulation
if endpoints:
- betweenness = _accumulate_endpoints(betweenness, S, P, sigma, s)
+ betweenness, delta = _accumulate_endpoints(betweenness, S, P, sigma, s)
else:
- betweenness = _accumulate_basic(betweenness, S, P, sigma, s)
+ betweenness, delta = _accumulate_basic(betweenness, S, P, sigma, s)
# rescaling
betweenness = _rescale(
betweenness,
@@ -221,9 +221,9 @@ def edge_betweenness_centrality(G, k=None, normalized=True, weight=None, seed=No
for s in nodes:
# single source shortest paths
if weight is None: # use BFS
- S, P, sigma = _single_source_shortest_path_basic(G, s)
+ S, P, sigma, _ = _single_source_shortest_path_basic(G, s)
else: # use Dijkstra's algorithm
- S, P, sigma = _single_source_dijkstra_path_basic(G, s, weight)
+ S, P, sigma, _ = _single_source_dijkstra_path_basic(G, s, weight)
# accumulation
betweenness = _accumulate_edges(betweenness, S, P, sigma, s)
# rescaling
@@ -268,7 +268,7 @@ def _single_source_shortest_path_basic(G, s):
if D[w] == Dv + 1: # this is a shortest path, count paths
sigma[w] += sigmav
P[w].append(v) # predecessors
- return S, P, sigma
+ return S, P, sigma, D
def _single_source_dijkstra_path_basic(G, s, weight):
@@ -303,7 +303,7 @@ def _single_source_dijkstra_path_basic(G, s, weight):
elif vw_dist == seen[w]: # handle equal paths
sigma[w] += sigma[v]
P[w].append(v)
- return S, P, sigma
+ return S, P, sigma, D
def _accumulate_basic(betweenness, S, P, sigma, s):
@@ -315,7 +315,7 @@ def _accumulate_basic(betweenness, S, P, sigma, s):
delta[v] += sigma[v] * coeff
if w != s:
betweenness[w] += delta[w]
- return betweenness
+ return betweenness, delta
def _accumulate_endpoints(betweenness, S, P, sigma, s):
@@ -328,7 +328,7 @@ def _accumulate_endpoints(betweenness, S, P, sigma, s):
delta[v] += sigma[v] * coeff
if w != s:
betweenness[w] += delta[w] + 1
- return betweenness
+ return betweenness, delta
def _accumulate_edges(betweenness, S, P, sigma, s):
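The hunks above extend the private shortest-path helpers to also return the distance dict D, and the accumulators to also return the dependency dict delta, so that group.py can reuse them; existing callers simply discard the extra value with `_`. A sketch of the new contract (these are underscore-private helpers and may change without notice):

    import networkx as nx
    from networkx.algorithms.centrality.betweenness import (
        _single_source_shortest_path_basic,
    )

    G = nx.path_graph(5)
    # S: nodes in order of non-decreasing distance from the source,
    # P: predecessor lists, sigma: shortest-path counts, D: distances.
    S, P, sigma, D = _single_source_shortest_path_basic(G, 0)
    print(D)  # {0: 0, 1: 1, 2: 2, 3: 3, 4: 4}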
diff --git a/networkx/algorithms/centrality/betweenness_subset.py b/networkx/algorithms/centrality/betweenness_subset.py
index 70d8eecb..2d22398e 100644
--- a/networkx/algorithms/centrality/betweenness_subset.py
+++ b/networkx/algorithms/centrality/betweenness_subset.py
@@ -102,9 +102,9 @@ def betweenness_centrality_subset(G, sources, targets, normalized=False, weight=
for s in sources:
# single source shortest paths
if weight is None: # use BFS
- S, P, sigma = shortest_path(G, s)
+ S, P, sigma, _ = shortest_path(G, s)
else: # use Dijkstra's algorithm
- S, P, sigma = dijkstra(G, s, weight)
+ S, P, sigma, _ = dijkstra(G, s, weight)
b = _accumulate_subset(b, S, P, sigma, s, targets)
b = _rescale(b, len(G), normalized=normalized, directed=G.is_directed())
return b
@@ -182,9 +182,9 @@ def edge_betweenness_centrality_subset(
for s in sources:
# single source shortest paths
if weight is None: # use BFS
- S, P, sigma = shortest_path(G, s)
+ S, P, sigma, _ = shortest_path(G, s)
else: # use Dijkstra's algorithm
- S, P, sigma = dijkstra(G, s, weight)
+ S, P, sigma, _ = dijkstra(G, s, weight)
b = _accumulate_edges_subset(b, S, P, sigma, s, targets)
for n in G: # remove nodes to only return edges
del b[n]
diff --git a/networkx/algorithms/centrality/group.py b/networkx/algorithms/centrality/group.py
index bd1d5f9e..9920352f 100644
--- a/networkx/algorithms/centrality/group.py
+++ b/networkx/algorithms/centrality/group.py
@@ -1,10 +1,15 @@
"""Group centrality measures."""
-from itertools import combinations
-
+from copy import deepcopy
import networkx as nx
from networkx.utils.decorators import not_implemented_for
+from networkx.algorithms.centrality.betweenness import (
+ _single_source_shortest_path_basic,
+ _single_source_dijkstra_path_basic,
+ _accumulate_endpoints,
+)
+
__all__ = [
"group_betweenness_centrality",
@@ -15,7 +20,7 @@ __all__ = [
]
-def group_betweenness_centrality(G, C, normalized=True, weight=None):
+def group_betweenness_centrality(G, C, normalized=True, weight=None, endpoints=False):
r"""Compute the group betweenness centrality for a group of nodes.
Group betweenness centrality of a group of nodes $C$ is the sum of the
@@ -23,7 +28,7 @@ def group_betweenness_centrality(G, C, normalized=True, weight=None):
.. math::
- c_B(C) =\sum_{s,t \in V-C; s<t} \frac{\sigma(s, t|C)}{\sigma(s, t)}
+ c_B(C) =\sum_{s,t \in V-C; s\neq t} \frac{\sigma(s, t|C)}{\sigma(s, t)}
where $V$ is the set of nodes, $\sigma(s, t)$ is the number of
shortest $(s, t)$-paths, and $\sigma(s, t|C)$ is the number of
@@ -36,19 +41,21 @@ def group_betweenness_centrality(G, C, normalized=True, weight=None):
G : graph
A NetworkX graph.
- C : list or set
- C is a group of nodes which belong to G, for which group betweenness
+ C : list or set or list of lists or list of sets
+ A group, or a list of groups, of nodes in G for which group betweenness
centrality is to be calculated.
normalized : bool, optional
- If True, group betweenness is normalized by `2/((|V|-|C|)(|V|-|C|-1))`
- for graphs and `1/((|V|-|C|)(|V|-|C|-1))` for directed graphs where `|V|`
- is the number of nodes in G and `|C|` is the number of nodes in C.
+ If True, group betweenness is normalized by `1/((|V|-|C|)(|V|-|C|-1))`
+ where `|V|` is the number of nodes in G and `|C|` is the number of nodes in C.
weight : None or string, optional (default=None)
If None, all edge weights are considered equal.
Otherwise holds the name of the edge attribute used as weight.
+ endpoints : bool, optional
+ If True, include the endpoints in the shortest path counts.
+
Raises
------
NodeNotFound
@@ -56,8 +63,9 @@ def group_betweenness_centrality(G, C, normalized=True, weight=None):
Returns
-------
- betweenness : float
- Group betweenness centrality of the group C.
+ betweenness : list of floats or float
+ If C is a single group, a float is returned; if C is a list of
+ groups, a list of group betweenness centralities is returned.
See Also
--------
@@ -65,11 +73,9 @@ def group_betweenness_centrality(G, C, normalized=True, weight=None):
Notes
-----
- The measure is described in [1]_.
- The algorithm is an extension of the one proposed by Ulrik Brandes for
- betweenness centrality of nodes. Group betweenness is also mentioned in
- his paper [2]_ along with the algorithm. The importance of the measure is
- discussed in [3]_.
+ Group betweenness centrality is described in [1]_ and its importance discussed in [3]_.
+ The initial implementation of the algorithm is mentioned in [2]_. This function uses
+ an improved algorithm presented in [4]_.
The number of nodes in the group must be a maximum of n - 2 where `n`
is the total number of nodes in the graph.
@@ -93,40 +99,133 @@ def group_betweenness_centrality(G, C, normalized=True, weight=None):
Group Centrality Maximization via Network Design.
SIAM International Conference on Data Mining, SDM 2018, 126–134.
https://sites.cs.ucsb.edu/~arlei/pubs/sdm18.pdf
+ .. [4] Rami Puzis, Yuval Elovici, and Shlomi Dolev.
+ "Fast algorithm for successive computation of group betweenness centrality."
+ https://journals.aps.org/pre/pdf/10.1103/PhysRevE.76.056709
+
"""
- betweenness = 0 # initialize betweenness to 0
- V = set(G) # set of nodes in G
- C = set(C) # set of nodes in C (group)
- if len(C - V) != 0: # element(s) of C not in V
- raise nx.NodeNotFound(
- "The node(s) " + str(list(C - V)) + " are not " "in the graph."
- )
- V_C = V - C # set of nodes in V but not in C
- # accumulation
- for pair in combinations(V_C, 2): # (s, t) pairs of V_C
- try:
- paths = 0
- paths_through_C = 0
- for path in nx.all_shortest_paths(
- G, source=pair[0], target=pair[1], weight=weight
- ):
- if set(path) & C:
- paths_through_C += 1
- paths += 1
- betweenness += paths_through_C / paths
- except nx.exception.NetworkXNoPath:
- betweenness += 0
- # rescaling
- v, c = len(G), len(C)
- if normalized:
- scale = 1 / ((v - c) * (v - c - 1))
- if not G.is_directed():
- scale *= 2
+ GBC = [] # initialize betweenness
+ list_of_groups = True
+ # check whether C contains one group or a list of groups
+ if any(el in G for el in C):
+ C = [C]
+ list_of_groups = False
+ set_v = {node for group in C for node in group}
+ if set_v - G.nodes: # element(s) of C not in G
+ raise nx.NodeNotFound(f"The node(s) {set_v - G.nodes} are in C but not in G.")
+
+ # pre-processing
+ PB, sigma, D = _group_preprocessing(G, set_v, weight)
+
+ # the algorithm for each group
+ for group in C:
+ group = set(group) # set of nodes in group
+ # initialize the matrices of the sigma and the PB
+ GBC_group = 0
+ sigma_m = deepcopy(sigma)
+ PB_m = deepcopy(PB)
+ sigma_m_v = deepcopy(sigma_m)
+ PB_m_v = deepcopy(PB_m)
+ for v in group:
+ GBC_group += PB_m[v][v]
+ for x in group:
+ for y in group:
+ dxvy = 0
+ dxyv = 0
+ dvxy = 0
+ if not (
+ sigma_m[x][y] == 0 or sigma_m[x][v] == 0 or sigma_m[v][y] == 0
+ ):
+ if D[x][v] == D[x][y] + D[y][v]:
+ dxyv = sigma_m[x][y] * sigma_m[y][v] / sigma_m[x][v]
+ if D[x][y] == D[x][v] + D[v][y]:
+ dxvy = sigma_m[x][v] * sigma_m[v][y] / sigma_m[x][y]
+ if D[v][y] == D[v][x] + D[x][y]:
+ dvxy = sigma_m[v][x] * sigma[x][y] / sigma[v][y]
+ sigma_m_v[x][y] = sigma_m[x][y] * (1 - dxvy)
+ PB_m_v[x][y] = PB_m[x][y] - PB_m[x][y] * dxvy
+ if y != v:
+ PB_m_v[x][y] -= PB_m[x][v] * dxyv
+ if x != v:
+ PB_m_v[x][y] -= PB_m[v][y] * dvxy
+ sigma_m, sigma_m_v = sigma_m_v, sigma_m
+ PB_m, PB_m_v = PB_m_v, PB_m
+
+ # endpoints
+ v, c = len(G), len(group)
+ if not endpoints:
+ scale = 0
+ # If the graph is connected, subtract the endpoint contribution
+ # for every node in the graph; otherwise, count how many nodes
+ # are reachable from each group node and subtract that.
+ if nx.is_directed(G):
+ if nx.is_strongly_connected(G):
+ scale = c * (2 * v - c - 1)
+ elif nx.is_connected(G):
+ scale = c * (2 * v - c - 1)
+ if scale == 0:
+ for group_node1 in group:
+ for node in G:
+ if node in D[group_node1] and node != group_node1:
+ if node in group:
+ scale += 1
+ else:
+ scale += 2
+ GBC_group -= scale
+
+ # normalized
+ if normalized:
+ scale = 1 / ((v - c) * (v - c - 1))
+ GBC_group *= scale
+
+ # If G is undirected, count each unordered (s, t) pair only once
+ elif not G.is_directed():
+ GBC_group /= 2
+
+ GBC.append(GBC_group)
+ if list_of_groups:
+ return GBC
else:
- scale = None
- if scale is not None:
- betweenness *= scale
- return betweenness
+ return GBC[0]
+
+
+def _group_preprocessing(G, set_v, weight):
+ sigma = {}
+ delta = {}
+ D = {}
+ betweenness = dict.fromkeys(G, 0)
+ for s in G:
+ if weight is None: # use BFS
+ S, P, sigma[s], D[s] = _single_source_shortest_path_basic(G, s)
+ else: # use Dijkstra's algorithm
+ S, P, sigma[s], D[s] = _single_source_dijkstra_path_basic(G, s, weight)
+ betweenness, delta[s] = _accumulate_endpoints(betweenness, S, P, sigma[s], s)
+ for i in delta[s].keys(): # add the paths from s to i and rescale sigma
+ if s != i:
+ delta[s][i] += 1
+ if weight is not None:
+ sigma[s][i] = sigma[s][i] / 2
+ # building the path betweenness matrix only for nodes that appear in the group
+ PB = dict.fromkeys(G)
+ for group_node1 in set_v:
+ PB[group_node1] = dict.fromkeys(G, 0.0)
+ for group_node2 in set_v:
+ if group_node2 not in D[group_node1]:
+ continue
+ for node in G:
+ # only consider nodes from which both group nodes are reachable
+ if group_node2 in D[node] and group_node1 in D[node]:
+ if (
+ D[node][group_node2]
+ == D[node][group_node1] + D[group_node1][group_node2]
+ ):
+ PB[group_node1][group_node2] += (
+ delta[node][group_node2]
+ * sigma[node][group_node1]
+ * sigma[group_node1][group_node2]
+ / sigma[node][group_node2]
+ )
+ return PB, sigma, D
def group_closeness_centrality(G, S, weight=None):
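As a reading aid for the group.py hunks above (my paraphrase of the successive-update scheme of Puzis et al. [4], not text from the patch): after preprocessing computes the path counts $\sigma$, distances $D$ and the path-betweenness matrix $PB$, each node $v$ absorbed into the group first contributes $PB_{vv}$ to the GBC and then updates the matrices, using

$$\delta_{abc} = \begin{cases} \sigma_{ab}\,\sigma_{bc}/\sigma_{ac}, & d(a,c) = d(a,b) + d(b,c) \\ 0, & \text{otherwise} \end{cases}$$

$$\sigma'_{xy} = \sigma_{xy}\,(1 - \delta_{xvy})$$

$$PB'_{xy} = PB_{xy}\,(1 - \delta_{xvy}) - [y \neq v]\,PB_{xv}\,\delta_{xyv} - [x \neq v]\,PB_{vy}\,\delta_{vxy}$$

for all $x, y$ in the group; this is the $O(k^3)$ per-group loop the commit message refers to.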
diff --git a/networkx/algorithms/centrality/percolation.py b/networkx/algorithms/centrality/percolation.py
index 5867b54a..f4414ffc 100644
--- a/networkx/algorithms/centrality/percolation.py
+++ b/networkx/algorithms/centrality/percolation.py
@@ -93,9 +93,9 @@ def percolation_centrality(G, attribute="percolation", states=None, weight=None)
for s in nodes:
# single source shortest paths
if weight is None: # use BFS
- S, P, sigma = shortest_path(G, s)
+ S, P, sigma, _ = shortest_path(G, s)
else: # use Dijkstra's algorithm
- S, P, sigma = dijkstra(G, s, weight)
+ S, P, sigma, _ = dijkstra(G, s, weight)
# accumulation
percolation = _accumulate_percolation(
percolation, G, S, P, sigma, s, states, p_sigma_x_t
diff --git a/networkx/algorithms/centrality/tests/test_group.py b/networkx/algorithms/centrality/tests/test_group.py
index 299927a5..a2366021 100644
--- a/networkx/algorithms/centrality/tests/test_group.py
+++ b/networkx/algorithms/centrality/tests/test_group.py
@@ -14,10 +14,24 @@ class TestGroupBetweennessCentrality:
"""
G = nx.path_graph(5)
C = [1]
- b = nx.group_betweenness_centrality(G, C, weight=None, normalized=False)
+ b = nx.group_betweenness_centrality(
+ G, C, weight=None, normalized=False, endpoints=False
+ )
b_answer = 3.0
assert b == b_answer
+ def test_group_betweenness_with_endpoints(self):
+ """
+ Group betweenness centrality for a single-node group, counting endpoints
+ """
+ G = nx.path_graph(5)
+ C = [1]
+ b = nx.group_betweenness_centrality(
+ G, C, weight=None, normalized=False, endpoints=True
+ )
+ b_answer = 7.0
+ assert b == b_answer
+
def test_group_betweenness_normalized(self):
"""
Group betweenness centrality for group with more than
@@ -25,17 +39,29 @@ class TestGroupBetweennessCentrality:
"""
G = nx.path_graph(5)
C = [1, 3]
- b = nx.group_betweenness_centrality(G, C, weight=None, normalized=True)
+ b = nx.group_betweenness_centrality(
+ G, C, weight=None, normalized=True, endpoints=False
+ )
b_answer = 1.0
assert b == b_answer
+ def test_two_group_betweenness_value_zero(self):
+ """
+ Group betweenness centrality for a list of two groups
+ """
+ G = nx.cycle_graph(7)
+ C = [[0, 1, 6], [0, 1, 5]]
+ b = nx.group_betweenness_centrality(G, C, weight=None, normalized=False)
+ b_answer = [0.0, 3.0]
+ assert b == b_answer
+
def test_group_betweenness_value_zero(self):
"""
Group betweenness centrality value of 0
"""
G = nx.cycle_graph(6)
C = [0, 1, 5]
- b = nx.group_betweenness_centrality(G, C, weight=None)
+ b = nx.group_betweenness_centrality(G, C, weight=None, normalized=False)
b_answer = 0.0
assert b == b_answer
@@ -46,7 +72,7 @@ class TestGroupBetweennessCentrality:
G = nx.path_graph(5)
G.remove_edge(0, 1)
C = [1]
- b = nx.group_betweenness_centrality(G, C, weight=None)
+ b = nx.group_betweenness_centrality(G, C, weight=None, normalized=False)
b_answer = 0.0
assert b == b_answer
@@ -55,7 +81,24 @@ class TestGroupBetweennessCentrality:
Node(s) in C not in graph, raises NodeNotFound exception
"""
with pytest.raises(nx.NodeNotFound):
- b = nx.group_betweenness_centrality(nx.path_graph(5), [6, 7, 8])
+ b = nx.group_betweenness_centrality(nx.path_graph(5), [4, 7, 8])
+
+ def test_group_betweenness_directed_weighted(self):
+ """
+ Group betweenness centrality in a directed and weighted graph
+ """
+ G = nx.DiGraph()
+ G.add_edge(1, 0, weight=1)
+ G.add_edge(0, 2, weight=2)
+ G.add_edge(1, 2, weight=3)
+ G.add_edge(3, 1, weight=4)
+ G.add_edge(2, 3, weight=1)
+ G.add_edge(4, 3, weight=6)
+ G.add_edge(2, 4, weight=7)
+ C = [1, 2]
+ b = nx.group_betweenness_centrality(G, C, weight="weight", normalized=False)
+ b_answer = 5.0
+ assert b == b_answer
class TestGroupClosenessCentrality:
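A quick hand check of the two path-graph tests above (illustrative, not part of the patch): in path_graph(5) = 0-1-2-3-4, node 1 lies on the shortest paths of the pairs (0, 2), (0, 3) and (0, 4), giving GBC = 3.0; with endpoints=True the four pairs (1, 0), (1, 2), (1, 3) and (1, 4) that start or end at node 1 are counted as well, giving 7.0.

    import networkx as nx

    G = nx.path_graph(5)
    assert nx.group_betweenness_centrality(G, [1], normalized=False) == 3.0
    assert nx.group_betweenness_centrality(G, [1], normalized=False, endpoints=True) == 7.0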