-
Notifications
You must be signed in to change notification settings - Fork 310
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix louvain python code to handle isolated vertices #3885
Changes from all commits
38d917a
052e7bc
28d422b
2af9b90
69a036f
3b3c35b
c11e064
cbb414a
0f15b97
8cd7866
a0c4bad
555e632
0dc6faf
4ca2b0c
a7862d8
34cdea1
09f1ac0
1b3205a
7138a55
2252e7b
70c0b5b
a524ee7
fa609dd
3bd3e5a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,6 +12,7 @@ | |
# limitations under the License. | ||
|
||
from cugraph.utilities import ( | ||
is_nx_graph_type, | ||
ensure_cugraph_obj_for_nx, | ||
df_score_to_dictionary, | ||
) | ||
|
@@ -21,6 +22,9 @@ | |
from pylibcugraph import louvain as pylibcugraph_louvain | ||
from pylibcugraph import ResourceHandle | ||
|
||
VERTEX_COL_NAME = "vertex" | ||
CLUSTER_ID_COL_NAME = "partition" | ||
|
||
|
||
# FIXME: max_level should default to 100 once max_iter is removed | ||
def louvain(G, max_level=None, max_iter=None, resolution=1.0, threshold=1e-7): | ||
|
@@ -72,9 +76,9 @@ def louvain(G, max_level=None, max_iter=None, resolution=1.0, threshold=1e-7): | |
GPU data frame of size V containing two columns the vertex id and the | ||
partition id it is assigned to. | ||
|
||
df['vertex'] : cudf.Series | ||
result_df['vertex'] : cudf.Series | ||
Contains the vertex identifiers | ||
df['partition'] : cudf.Series | ||
result_df['partition'] : cudf.Series | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. not using CLUSTER_ID_COL_NAME ? |
||
Contains the partition assigned to the vertices | ||
|
||
modularity_score : float | ||
|
@@ -89,6 +93,14 @@ def louvain(G, max_level=None, max_iter=None, resolution=1.0, threshold=1e-7): | |
|
||
""" | ||
|
||
isolated_vertices = list() | ||
if is_nx_graph_type(type(G)): | ||
num_of_vertices = len(G.nodes) | ||
isolated_vertices = [v for v in range(num_of_vertices) if G.degree[v] == 0] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this seems like a slow process vs putting this code later G, isNx = ensure_cugraph_obj_for_nx(Gin) if (isNx) |
||
else: | ||
# FIXME: Gather list of isolated vertices for other graph types | ||
pass | ||
|
||
G, isNx = ensure_cugraph_obj_for_nx(G) | ||
|
||
if G.is_directed(): | ||
|
@@ -112,6 +124,9 @@ def louvain(G, max_level=None, max_iter=None, resolution=1.0, threshold=1e-7): | |
if max_level is None: | ||
max_level = 100 | ||
|
||
if max_level > 1000: | ||
max_level = 1000 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The docs should say that 1000 is the max level |
||
|
||
vertex, partition, mod_score = pylibcugraph_louvain( | ||
resource_handle=ResourceHandle(), | ||
graph=G._plc_graph, | ||
|
@@ -121,14 +136,27 @@ def louvain(G, max_level=None, max_iter=None, resolution=1.0, threshold=1e-7): | |
do_expensive_check=False, | ||
) | ||
|
||
df = cudf.DataFrame() | ||
df["vertex"] = vertex | ||
df["partition"] = partition | ||
result_df = cudf.DataFrame() | ||
result_df[VERTEX_COL_NAME] = vertex | ||
result_df[CLUSTER_ID_COL_NAME] = partition | ||
|
||
unique_cids = result_df[CLUSTER_ID_COL_NAME].unique() | ||
max_cluster_id = -1 if len(result_df) == 0 else unique_cids.max() | ||
|
||
if len(isolated_vertices) > 0: | ||
isolated_vtx_and_cids = cudf.DataFrame() | ||
isolated_vtx_and_cids[VERTEX_COL_NAME] = isolated_vertices | ||
isolated_vtx_and_cids[CLUSTER_ID_COL_NAME] = [ | ||
(max_cluster_id + i + 1) for i in range(len(isolated_vertices)) | ||
] | ||
result_df = cudf.concat( | ||
[result_df, isolated_vtx_and_cids], ignore_index=True, sort=False | ||
) | ||
|
||
if G.renumbered: | ||
df = G.unrenumber(df, "vertex") | ||
result_df = G.unrenumber(result_df, VERTEX_COL_NAME) | ||
|
||
if isNx is True: | ||
df = df_score_to_dictionary(df, "partition") | ||
result_df = df_score_to_dictionary(result_df, CLUSTER_ID_COL_NAME) | ||
|
||
return df, mod_score | ||
return result_df, mod_score |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
not using VERTEX_COL_NAME ?