From 2e2a463cdb202780949baa6961692e548209fea1 Mon Sep 17 00:00:00 2001 From: Ralph Liu Date: Tue, 26 Nov 2024 22:42:42 -0800 Subject: [PATCH 1/5] Add support for Amazon0302 benchmarking dataset --- python/cugraph/cugraph/datasets/__init__.py | 3 ++- .../cugraph/datasets/metadata/amazon0302.yaml | 21 +++++++++++++++++++ python/cugraph/cugraph/testing/__init__.py | 11 ++++++++-- 3 files changed, 32 insertions(+), 3 deletions(-) create mode 100644 python/cugraph/cugraph/datasets/metadata/amazon0302.yaml diff --git a/python/cugraph/cugraph/datasets/__init__.py b/python/cugraph/cugraph/datasets/__init__.py index ac18274d354..ecf10f3c4ef 100644 --- a/python/cugraph/cugraph/datasets/__init__.py +++ b/python/cugraph/cugraph/datasets/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -49,3 +49,4 @@ europe_osm = Dataset(meta_path / "europe_osm.yaml") # 1.5 GB hollywood = Dataset(meta_path / "hollywood.yaml") +amazon0302 = Dataset(meta_path / "amazon0302.yaml") diff --git a/python/cugraph/cugraph/datasets/metadata/amazon0302.yaml b/python/cugraph/cugraph/datasets/metadata/amazon0302.yaml new file mode 100644 index 00000000000..2daeae37461 --- /dev/null +++ b/python/cugraph/cugraph/datasets/metadata/amazon0302.yaml @@ -0,0 +1,21 @@ +name: amazon0302 +file_type: .gz +description: + Network was collected by crawling Amazon website. It is based on Customers Who Bought This Item Also Bought feature of the Amazon website. If a product i is frequently co-purchased with product j, the graph contains a directed edge from i to j. The data was collected in March 02 2003. +author: J. Leskovec, L. Adamic and B. Adamic +refs: J. Leskovec, L. Adamic and B. Adamic. The Dynamics of Viral Marketing. ACM Transactions on the Web (ACM TWEB), 1(1), 2007. +delim: "\t" +header: 3 +col_names: + - FromNodeId + - ToNodeId +col_types: + - int32 + - int32 +has_loop: false +is_directed: true +is_multigraph: false +is_symmetric: false +number_of_edges: 1234877 +number_of_nodes: 262111 +url: https://snap.stanford.edu/data/amazon0302.txt.gz diff --git a/python/cugraph/cugraph/testing/__init__.py b/python/cugraph/cugraph/testing/__init__.py index 2b4a4fd3ebf..5c89159bcff 100644 --- a/python/cugraph/cugraph/testing/__init__.py +++ b/python/cugraph/cugraph/testing/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -38,6 +38,7 @@ cit_patents, europe_osm, hollywood, + amazon0302, # twitter, ) @@ -71,4 +72,10 @@ toy_graph_undirected, ] DEFAULT_DATASETS = [dolphins, netscience, karate_disjoint] -BENCHMARKING_DATASETS = [soc_livejournal, cit_patents, europe_osm, hollywood] +BENCHMARKING_DATASETS = [ + soc_livejournal, + cit_patents, + europe_osm, + hollywood, + amazon0302, +] From c280f081764bf37f8300020b9329497a477deb66 Mon Sep 17 00:00:00 2001 From: Ralph Liu <137829296+nv-rliu@users.noreply.github.com> Date: Tue, 10 Dec 2024 10:33:23 +0900 Subject: [PATCH 2/5] Update python/cugraph/cugraph/datasets/metadata/amazon0302.yaml Co-authored-by: Rick Ratzel <3039903+rlratzel@users.noreply.github.com> --- python/cugraph/cugraph/datasets/metadata/amazon0302.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cugraph/cugraph/datasets/metadata/amazon0302.yaml b/python/cugraph/cugraph/datasets/metadata/amazon0302.yaml index 2daeae37461..7376f109318 100644 --- a/python/cugraph/cugraph/datasets/metadata/amazon0302.yaml +++ b/python/cugraph/cugraph/datasets/metadata/amazon0302.yaml @@ -1,7 +1,7 @@ name: amazon0302 file_type: .gz description: - Network was collected by crawling Amazon website. It is based on Customers Who Bought This Item Also Bought feature of the Amazon website. If a product i is frequently co-purchased with product j, the graph contains a directed edge from i to j. The data was collected in March 02 2003. +This network was collected by crawling the Amazon website. It is based on the "Customers Who Bought This Item Also Bought" feature of the Amazon website. If product i is frequently co-purchased with product j, the graph contains a directed edge from i to j. The data was collected in March 02 2003. author: J. Leskovec, L. Adamic and B. Adamic refs: J. Leskovec, L. Adamic and B. Adamic. The Dynamics of Viral Marketing. ACM Transactions on the Web (ACM TWEB), 1(1), 2007. delim: "\t" From 940695b6fc55bd775f6d37cabebd355d542cb80c Mon Sep 17 00:00:00 2001 From: Ralph Liu Date: Tue, 10 Dec 2024 23:12:18 -0800 Subject: [PATCH 3/5] Fix YAML formatting --- python/cugraph/cugraph/datasets/metadata/amazon0302.yaml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/python/cugraph/cugraph/datasets/metadata/amazon0302.yaml b/python/cugraph/cugraph/datasets/metadata/amazon0302.yaml index 7376f109318..b02c936a06e 100644 --- a/python/cugraph/cugraph/datasets/metadata/amazon0302.yaml +++ b/python/cugraph/cugraph/datasets/metadata/amazon0302.yaml @@ -1,9 +1,14 @@ name: amazon0302 file_type: .gz description: -This network was collected by crawling the Amazon website. It is based on the "Customers Who Bought This Item Also Bought" feature of the Amazon website. If product i is frequently co-purchased with product j, the graph contains a directed edge from i to j. The data was collected in March 02 2003. + This network was collected by crawling the Amazon website. It is based on the + "Customers Who Bought This Item Also Bought" feature of the Amazon website. + If product i is frequently co-purchased with product j, the graph contains a + directed edge from i to j. The data was collected in March 02 2003. author: J. Leskovec, L. Adamic and B. Adamic -refs: J. Leskovec, L. Adamic and B. Adamic. The Dynamics of Viral Marketing. ACM Transactions on the Web (ACM TWEB), 1(1), 2007. +refs: + J. Leskovec, L. Adamic and B. Adamic. The Dynamics of Viral Marketing. + ACM Transactions on the Web (ACM TWEB), 1(1), 2007. delim: "\t" header: 3 col_names: From 383d81b91e6a1c3a9085ad205805ed8ce70dba22 Mon Sep 17 00:00:00 2001 From: Ralph Liu Date: Wed, 11 Dec 2024 13:28:56 -0800 Subject: [PATCH 4/5] Add automatic column renaming to is_symmetric() --- python/cugraph/cugraph/tests/utils/test_dataset.py | 3 +++ python/cugraph/cugraph/utilities/utils.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/python/cugraph/cugraph/tests/utils/test_dataset.py b/python/cugraph/cugraph/tests/utils/test_dataset.py index 3873cd1c3e4..7b1c43e71b8 100644 --- a/python/cugraph/cugraph/tests/utils/test_dataset.py +++ b/python/cugraph/cugraph/tests/utils/test_dataset.py @@ -104,6 +104,7 @@ def is_symmetric(dataset): return True else: df = dataset.get_edgelist(download=True) + df.rename(columns={df.columns[0]: "src", df.columns[1]: "dst"}, inplace=True) df_a = df.sort_values("src") # create df with swapped src/dst columns @@ -443,6 +444,8 @@ def test_benchmarking_datasets(dataset): download=True, create_using=Graph(directed=dataset_is_directed) ) + breakpoint() + assert G.is_directed() == dataset.metadata["is_directed"] assert G.number_of_nodes() == dataset.metadata["number_of_nodes"] assert G.number_of_edges() == dataset.metadata["number_of_edges"] diff --git a/python/cugraph/cugraph/utilities/utils.py b/python/cugraph/cugraph/utilities/utils.py index 5bad68a79e2..0257da4ffc0 100644 --- a/python/cugraph/cugraph/utilities/utils.py +++ b/python/cugraph/cugraph/utilities/utils.py @@ -528,7 +528,7 @@ def create_list_series_from_2d_ar(ar, index): cp.arange(start=0, stop=len(data) + 1, step=n_cols), dtype="int32" ) mask_col = cp.full(shape=n_rows, fill_value=True) - mask = cudf._lib.transform.bools_to_mask(as_column(mask_col)) + mask = as_column(mask_col).as_mask() lc = cudf.core.column.ListColumn( data=None, size=n_rows, From b6a8a7bbc26bc1bb84359ea6eb7f363b6e90a3dc Mon Sep 17 00:00:00 2001 From: Ralph Liu Date: Wed, 11 Dec 2024 13:31:32 -0800 Subject: [PATCH 5/5] Remove break --- python/cugraph/cugraph/tests/utils/test_dataset.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/cugraph/cugraph/tests/utils/test_dataset.py b/python/cugraph/cugraph/tests/utils/test_dataset.py index 7b1c43e71b8..9895eb61c82 100644 --- a/python/cugraph/cugraph/tests/utils/test_dataset.py +++ b/python/cugraph/cugraph/tests/utils/test_dataset.py @@ -444,8 +444,6 @@ def test_benchmarking_datasets(dataset): download=True, create_using=Graph(directed=dataset_is_directed) ) - breakpoint() - assert G.is_directed() == dataset.metadata["is_directed"] assert G.number_of_nodes() == dataset.metadata["number_of_nodes"] assert G.number_of_edges() == dataset.metadata["number_of_edges"]