Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Investigate pyarrow version pinning (cudf.utils.nvtx_annotation issue) #3950

Closed
wants to merge 22 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
5c37396
Added benchmark datasets yaml files for use with Dataset
betochimas Sep 5, 2023
93fb3e7
Introduce way to download larger datasets
betochimas Sep 7, 2023
b236d85
Merge branch 'rapidsai:branch-23.10' into branch-23.10-large-datasets
betochimas Sep 7, 2023
e94baa3
Testing gpg verification
betochimas Sep 7, 2023
aafd5d0
Use create_using pattern to give user choice for dataframe lib
betochimas Sep 7, 2023
23edacf
Merge branch 'branch-23.10' into branch-23.10-large-datasets
betochimas Sep 7, 2023
a4ed8ef
Insert benchmarking Dataset instances
betochimas Sep 8, 2023
ddb8c35
Merge branch 'branch-23.10' into branch-23.10-large-datasets
betochimas Sep 8, 2023
31ca5ea
Changes to testing and fixing urls
betochimas Sep 19, 2023
8e0cfac
Testing changes to pass CI, added and removed comments
betochimas Sep 20, 2023
d15f601
Merge branch 'branch-23.10' into branch-23.10-large-datasets
betochimas Sep 20, 2023
045597a
Merge branch 'branch-23.10' into branch-23.10-large-datasets
betochimas Sep 22, 2023
a85cb7e
Merge branch 'branch-23.10' into branch-23.10-large-datasets
betochimas Sep 22, 2023
f816f78
Add FIXMEs for CI failure points
betochimas Sep 22, 2023
12c3d66
Merge branch 'branch-23.10-large-datasets' of https://github.com/beto…
nv-rliu Oct 9, 2023
7c0db5d
Merge branch 'rapidsai:branch-23.12' into branch-23.12-large-datasets
nv-rliu Oct 16, 2023
abda461
Merge branch 'rapidsai:branch-23.12' into branch-23.12-large-datasets
nv-rliu Oct 17, 2023
3fb602b
update large dataset work. primarily unit tests
nv-rliu Oct 17, 2023
858e155
Merge branch 'branch-23.12-large-datasets' of github.com:nv-rliu/cugr…
nv-rliu Oct 17, 2023
875011d
remove twitter
nv-rliu Oct 17, 2023
424bc51
fix bug in test fixture for unweighted graphs
nv-rliu Oct 18, 2023
27c8d89
Merge branch 'branch-23.12' of github.com:nv-rliu/cugraph into branch…
nv-rliu Oct 23, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions datasets/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -120,9 +120,13 @@ The benchmark datasets are described below:
| soc-twitter-2010 | 21,297,772 | 265,025,809 | No | No |

**cit-Patents** : A citation graph that includes all citations made by patents granted between 1975 and 1999, totaling 16,522,438 citations.

**soc-LiveJournal** : A graph of the LiveJournal social network.

**europe_osm** : A graph of OpenStreetMap data for Europe.

**hollywood** : A graph of movie actors where vertices are actors, and two actors are joined by an edge whenever they appeared in a movie together.

**soc-twitter-2010** : A network of follower relationships from a snapshot of Twitter in 2010, where an edge from i to j indicates that j is a follower of i.

_NOTE: the benchmark datasets were converted to a CSV format from their original format described in the reference URL below, and in doing so had edge weights and isolated vertices discarded._
Expand Down
10 changes: 10 additions & 0 deletions python/cugraph/cugraph/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,13 @@
small_tree = Dataset(meta_path / "small_tree.yaml")
toy_graph = Dataset(meta_path / "toy_graph.yaml")
toy_graph_undirected = Dataset(meta_path / "toy_graph_undirected.yaml")

# Benchmarking datasets: be mindful of memory usage.
# NOTE(review): the original size labels were swapped — soc-LiveJournal1
# (68,993,773 edges per its YAML) is several times larger than cit-Patents
# (16,518,948 edges), so it cannot be the 250 MB file.
# ~965 MB
soc_livejournal = Dataset(meta_path / "soc-livejournal1.yaml")
# ~250 MB
cit_patents = Dataset(meta_path / "cit-patents.yaml")
# ~1.8 GB
europe_osm = Dataset(meta_path / "europe_osm.yaml")
# ~1.5 GB
hollywood = Dataset(meta_path / "hollywood.yaml")
31 changes: 23 additions & 8 deletions python/cugraph/cugraph/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ def unload(self):
"""
self._edgelist = None

def get_edgelist(self, download=False):
def get_edgelist(self, download=False, create_using=cudf):
"""
Return an Edgelist

Expand All @@ -168,6 +168,10 @@ def get_edgelist(self, download=False):
download : Boolean (default=False)
Automatically download the dataset from the 'url' location within
the YAML file.

create_using : module (default=cudf)
Specify which module to use when reading the dataset. This module
must have a read_csv function.
"""
if self._edgelist is None:
full_path = self.get_path()
Expand All @@ -183,7 +187,15 @@ def get_edgelist(self, download=False):
header = None
if isinstance(self.metadata["header"], int):
header = self.metadata["header"]
self._edgelist = cudf.read_csv(
# Resolve which dataframe module's read_csv will load the edgelist.
if create_using is None:
    reader = cudf
elif str(type(create_using)) != "<class 'module'>":
    raise RuntimeError("create_using must be a module.")
elif create_using.__name__ in ("cudf", "pandas"):
    # Fixed: the original condition was `== "cudf" or "pandas"`, which is
    # always truthy ("pandas" is a non-empty string), so ANY module was
    # accepted and the NotImplementedError branch below was unreachable.
    reader = create_using
else:
    # Only cudf and pandas expose the read_csv interface this loader uses.
    raise NotImplementedError()
self._edgelist = reader.read_csv(
full_path,
delimiter=self.metadata["delim"],
names=self.metadata["col_names"],
Expand Down Expand Up @@ -219,6 +231,10 @@ def get_graph(
dataset -if present- will be applied to the Graph. If the
dataset does not contain weights, the Graph returned will
be unweighted regardless of ignore_weights.

store_transposed: Boolean (default=False)
If True, stores the transpose of the adjacency matrix. Required
for certain algorithms, such as pagerank.
"""
if self._edgelist is None:
self.get_edgelist(download)
Expand All @@ -237,20 +253,19 @@ def get_graph(
"(or subclass) type or instance, got: "
f"{type(create_using)}"
)

if len(self.metadata["col_names"]) > 2 and not (ignore_weights):
G.from_cudf_edgelist(
self._edgelist,
source="src",
destination="dst",
edge_attr="wgt",
source=self.metadata["col_names"][0],
destination=self.metadata["col_names"][1],
edge_attr=self.metadata["col_names"][2],
store_transposed=store_transposed,
)
else:
G.from_cudf_edgelist(
self._edgelist,
source="src",
destination="dst",
source=self.metadata["col_names"][0],
destination=self.metadata["col_names"][1],
store_transposed=store_transposed,
)
return G
Expand Down
22 changes: 22 additions & 0 deletions python/cugraph/cugraph/datasets/metadata/cit-patents.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
name: cit-Patents
file_type: .csv
description: A citation graph that includes all citations made by patents granted between 1975 and 1999, totaling 16,522,438 citations.
author: NBER
refs:
J. Leskovec, J. Kleinberg and C. Faloutsos. Graphs over Time: Densification Laws, Shrinking Diameters and Possible Explanations.
ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD), 2005.
delim: " "
header: None
col_names:
- src
- dst
col_types:
- int32
- int32
has_loop: true
is_directed: true
is_multigraph: false
is_symmetric: false
number_of_edges: 16518948
number_of_nodes: 3774768
url: https://data.rapids.ai/cugraph/datasets/cit-Patents.csv
21 changes: 21 additions & 0 deletions python/cugraph/cugraph/datasets/metadata/europe_osm.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
name: europe_osm
file_type: .csv
description: A graph of OpenStreetMap data for Europe.
author: M. Kobitzsh / Geofabrik GmbH
refs:
Rossi, Ryan. Ahmed, Nesreen. The Network Data Repository with Interactive Graph Analytics and Visualization.
delim: " "
header: None
col_names:
- src
- dst
col_types:
- int32
- int32
has_loop: false
is_directed: false
is_multigraph: false
is_symmetric: true
number_of_edges: 54054660
number_of_nodes: 50912018
url: https://data.rapids.ai/cugraph/datasets/europe_osm.csv
26 changes: 26 additions & 0 deletions python/cugraph/cugraph/datasets/metadata/hollywood.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
name: hollywood
file_type: .csv
description:
A graph of movie actors where vertices are actors, and two actors are
joined by an edge whenever they appeared in a movie together.
author: Laboratory for Web Algorithmics (LAW)
refs:
The WebGraph Framework I: Compression Techniques, Paolo Boldi
and Sebastiano Vigna, Proc. of the Thirteenth International
World Wide Web Conference (WWW 2004), 2004, Manhattan, USA,
pp. 595--601, ACM Press.
delim: " "
header: None
col_names:
- src
- dst
col_types:
- int32
- int32
has_loop: false
is_directed: false
is_multigraph: false
is_symmetric: true
number_of_edges: 57515616
number_of_nodes: 1139905
url: https://data.rapids.ai/cugraph/datasets/hollywood.csv
22 changes: 22 additions & 0 deletions python/cugraph/cugraph/datasets/metadata/soc-livejournal1.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
name: soc-LiveJournal1
file_type: .csv
description: A graph of the LiveJournal social network.
author: L. Backstrom, D. Huttenlocher, J. Kleinberg, X. Lan
refs:
L. Backstrom, D. Huttenlocher, J. Kleinberg, X. Lan. Group Formation in
Large Social Networks: Membership, Growth, and Evolution. KDD, 2006.
delim: " "
header: None
col_names:
- src
- dst
col_types:
- int32
- int32
has_loop: true
is_directed: true
is_multigraph: false
is_symmetric: false
number_of_edges: 68993773
number_of_nodes: 4847571
url: https://data.rapids.ai/cugraph/datasets/soc-LiveJournal1.csv
22 changes: 22 additions & 0 deletions python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
name: soc-twitter-2010
file_type: .csv
description: A network of follower relationships from a snapshot of Twitter in 2010, where an edge from i to j indicates that j is a follower of i.
author: H. Kwak, C. Lee, H. Park, S. Moon
refs:
J. Yang, J. Leskovec. Temporal Variation in Online Media. ACM Intl.
Conf. on Web Search and Data Mining (WSDM '11), 2011.
delim: " "
header: None
col_names:
- src
- dst
col_types:
- int32
- int32
has_loop: false
is_directed: false
is_multigraph: false
is_symmetric: false
number_of_edges: 530051354
number_of_nodes: 21297772
url: https://data.rapids.ai/cugraph/datasets/soc-twitter-2010.csv
6 changes: 6 additions & 0 deletions python/cugraph/cugraph/testing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@
email_Eu_core,
toy_graph,
toy_graph_undirected,
soc_livejournal,
cit_patents,
europe_osm,
hollywood,
# twitter,
)

#
Expand Down Expand Up @@ -66,3 +71,4 @@
toy_graph_undirected,
]
DEFAULT_DATASETS = [dolphins, netscience, karate_disjoint]
BENCHMARKING_DATASETS = [soc_livejournal, cit_patents, europe_osm, hollywood]
40 changes: 33 additions & 7 deletions python/cugraph/cugraph/tests/utils/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
ALL_DATASETS,
WEIGHTED_DATASETS,
SMALL_DATASETS,
BENCHMARKING_DATASETS,
)
from cugraph import datasets

Expand Down Expand Up @@ -94,7 +95,10 @@ def setup_deprecation_warning_tests():
# Helpers

# check if there is a row where src == dst
def has_loop(df):
def has_selfloop(dataset):
if not dataset.metadata["is_directed"]:
return False
df = dataset.get_edgelist(download=True)
df.rename(columns={df.columns[0]: "src", df.columns[1]: "dst"}, inplace=True)
res = df.where(df["src"] == df["dst"])

Expand All @@ -109,7 +113,13 @@ def is_symmetric(dataset):
else:
df = dataset.get_edgelist(download=True)
df_a = df.sort_values("src")
df_b = df_a[["dst", "src", "wgt"]]

# create df with swapped src/dst columns
df_b = None
if "wgt" in df_a.columns:
df_b = df_a[["dst", "src", "wgt"]]
else:
df_b = df_a[["dst", "src"]]
df_b.rename(columns={"dst": "src", "src": "dst"}, inplace=True)
# created a df by appending the two
res = cudf.concat([df_a, df_b])
Expand Down Expand Up @@ -172,7 +182,6 @@ def test_get_graph(dataset):
@pytest.mark.parametrize("dataset", ALL_DATASETS)
def test_metadata(dataset):
    # Every Dataset must expose a parsed metadata mapping from its YAML file.
    assert dataset.metadata is not None


Expand Down Expand Up @@ -310,10 +319,8 @@ def test_is_directed(dataset):


@pytest.mark.parametrize("dataset", ALL_DATASETS)
def test_has_selfloop(dataset):
    # The superseded test_has_loop body (diff residue) is removed; only the
    # replacement test survives. Expected self-loop presence comes from the
    # dataset's YAML metadata.
    assert has_selfloop(dataset) == dataset.metadata["has_loop"]


@pytest.mark.parametrize("dataset", ALL_DATASETS)
Expand All @@ -328,6 +335,25 @@ def test_is_multigraph(dataset):
assert G.is_multigraph() == dataset.metadata["is_multigraph"]


# The datasets used for benchmarks are in their own test, since downloading
# them repeatedly would increase testing overhead significantly.
@pytest.mark.parametrize("dataset", BENCHMARKING_DATASETS)
def test_benchmarking_datasets(dataset):
    """Validate a large benchmark dataset's graph against its YAML metadata.

    Downloads the dataset, builds a Graph with the directedness recorded in
    the metadata, and checks node/edge counts and structural properties.
    """
    dataset_is_directed = dataset.metadata["is_directed"]
    try:
        G = dataset.get_graph(
            download=True, create_using=Graph(directed=dataset_is_directed)
        )

        assert G.is_directed() == dataset.metadata["is_directed"]
        assert G.number_of_nodes() == dataset.metadata["number_of_nodes"]
        assert G.number_of_edges() == dataset.metadata["number_of_edges"]
        assert has_selfloop(dataset) == dataset.metadata["has_loop"]
        assert is_symmetric(dataset) == dataset.metadata["is_symmetric"]
        assert G.is_multigraph() == dataset.metadata["is_multigraph"]
    finally:
        # Fix: the original called unload() only after all asserts passed, so
        # one failing assertion left a multi-GB edgelist resident for the rest
        # of the parametrized test session.
        dataset.unload()


@pytest.mark.parametrize("dataset", ALL_DATASETS)
def test_object_getters(dataset):
assert dataset.is_directed() == dataset.metadata["is_directed"]
Expand Down
Loading