Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Homo Optmization] Add homogeneous graph optimization #683

Merged
merged 22 commits into from
Dec 15, 2023
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 19 additions & 2 deletions python/graphstorm/gconstruct/construct_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -582,15 +582,32 @@ def process_edge_data(process_confs, node_id_map, arr_merger,

return (edges, edge_data, label_stats)

def verify_confs(confs):
def verify_confs(confs, rev_edges):
jalencato marked this conversation as resolved.
Show resolved Hide resolved
""" Verify the configuration of the input data.
Parameters
----------
rev_edges: bool
Whether to add reverse edges
"""
if "version" not in confs:
# TODO: Make a requirement with v1.0 launch
logging.warning(
"The config file does not have a 'version' entry. Assuming gconstruct-v0.1")
ntypes = {conf['node_type'] for conf in confs["nodes"]}
etypes = [conf['relation'] for conf in confs["edges"]]
# Adjust input to DGL homogeneous graph format if it is a homogeneous graph
etype_set = set(tuple(relation) for relation in etypes)
if len(ntypes) == 1 and len(etype_set) == 1 and not rev_edges:
assert etypes[0][0] in ntypes, \
f"source node type {etypes[0][0]} does not exist. Please check your input data."
assert etypes[0][2] in ntypes, \
f"dest node type {etypes[0][2]} does not exist. Please check your input data."
logging.warning("Generated Graph is a homogeneous graph, so the node type will be "
"changed to _N and edge type will be changed to [_N, _E, _N]")
for node in confs['nodes']:
node['node_type'] = "_N"
for edge in confs['edges']:
edge['relation'] = ["_N", "_E", "_N"]
for etype in etypes:
assert len(etype) == 3, \
"The edge type must be (source node type, relation type, dest node type)."
Expand Down Expand Up @@ -668,7 +685,7 @@ def process_graph(args):
if args.num_processes_for_nodes is not None else args.num_processes
num_processes_for_edges = args.num_processes_for_edges \
if args.num_processes_for_edges is not None else args.num_processes
verify_confs(process_confs)
verify_confs(process_confs, args.add_reverse_edges)
jalencato marked this conversation as resolved.
Show resolved Hide resolved
output_format = args.output_format
for out_format in output_format:
assert out_format in ["DGL", "DistDGL"], \
Expand Down
59 changes: 59 additions & 0 deletions tests/end2end-tests/data_gen/movielens_homo.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
{
jalencato marked this conversation as resolved.
Show resolved Hide resolved
"version": "gconstruct-v0.1",
"nodes": [
{
"node_id_col": "id",
"node_type": "movie",
"format": {"name": "parquet"},
"files": "/data/ml-100k/movie.parquet",
"features": [
{
"feature_col": "title",
"transform": {
"name": "bert_hf",
"bert_model": "bert-base-uncased",
"max_seq_length": 16
}
}
],
"labels": [
{
"label_col": "label",
"task_type": "classification",
"split_pct": [0.8, 0.1, 0.1]
}
]
},
{
"node_type": "movie",
"format": {"name": "parquet"},
"files": "/data/ml-100k/movie.parquet",
"features": [
{
"feature_col": "id"
}
]
}
],
"edges": [
{
"source_id_col": "src_id",
"dest_id_col": "dst_id",
"relation": ["movie", "rating", "movie"],
"format": {"name": "parquet"},
"files": "/data/ml-100k/edges_homo.parquet",
"labels": [
{
"label_col": "rate",
"task_type": "classification",
"split_pct": [0.1, 0.1, 0.1]
}
]
},
{
"relation": ["movie", "rating", "movie"],
"format": {"name": "parquet"},
"files": "/data/ml-100k/edges_homo.parquet"
}
]
}
5 changes: 5 additions & 0 deletions tests/end2end-tests/data_gen/process_movielens.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,11 @@ def write_data_parquet(data, data_file):
edge_data = {'src_id': edges[0], 'dst_id': edges[1], 'rate': edges[2]}
write_data_parquet(edge_data, '/data/ml-100k/edges.parquet')

# generate data for homogeneous optimization test
edges = pandas.read_csv('/data/ml-100k/u.data', delimiter='\t', header=None)
edge_data = {'src_id': edges[1], 'dst_id': edges[1], 'rate': edges[2]}
write_data_parquet(edge_data, '/data/ml-100k/edges_homo.parquet')
jalencato marked this conversation as resolved.
Show resolved Hide resolved

# generate synthetic user data with label
user_labels = np.random.randint(11, size=feat.shape[0])
user_data = {'id': user['id'].values, 'feat': feat, 'occupation': user['occupation'], 'label': user_labels}
Expand Down
10 changes: 9 additions & 1 deletion tests/end2end-tests/data_process/movielens_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@ service ssh restart
GS_HOME=$(pwd)
NUM_TRAINERS=4
export PYTHONPATH=$GS_HOME/python/
cd $GS_HOME/training_scripts/gsgnn_np
echo "127.0.0.1" > ip_list.txt
jalencato marked this conversation as resolved.
Show resolved Hide resolved
cd $GS_HOME/training_scripts/gsgnn_ep

echo "127.0.0.1" > ip_list.txt

error_and_exit () {
Expand All @@ -27,6 +28,13 @@ python3 -m graphstorm.gconstruct.construct_graph --conf-file $GS_HOME/tests/end2

error_and_exit $?

echo "********* Test Homogeneous Graph Optimization ********"
jalencato marked this conversation as resolved.
Show resolved Hide resolved
python3 -m graphstorm.gconstruct.construct_graph --conf-file $GS_HOME/tests/end2end-tests/data_gen/movielens_homo.json --num-processes 1 --output-dir /tmp/movielen_100k_train_val_1p_4t_homo --graph-name movie-lens-100k
error_and_exit $?

python3 -m graphstorm.run.gs_node_classification --workspace $GS_HOME/training_scripts/gsgnn_np/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /tmp/movielen_100k_train_val_1p_4t_homo/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_nc.yaml --target-ntype _N
error_and_exit $?

echo "********* Test the DistDGL graph format with BERT embeddings ********"
python3 -m graphstorm.gconstruct.construct_graph --conf-file $GS_HOME/tests/end2end-tests/data_gen/movielens.json --num-processes 1 --output-dir /tmp/movielens_bert_emb --graph-name ml --add-reverse-edges

Expand Down
57 changes: 55 additions & 2 deletions tests/unit-tests/gconstruct/test_construct_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from functools import partial
from numpy.testing import assert_equal, assert_almost_equal

from graphstorm.gconstruct.construct_graph import parse_edge_data
from graphstorm.gconstruct.construct_graph import parse_edge_data, verify_confs
from graphstorm.gconstruct.file_io import write_data_parquet, read_data_parquet
from graphstorm.gconstruct.file_io import write_data_json, read_data_json
from graphstorm.gconstruct.file_io import write_data_csv, read_data_csv
Expand Down Expand Up @@ -1705,6 +1705,58 @@ def test_gc():
assert not os.path.isdir("/tmp_featurewrapper2"), \
"Directory /tmp_featurewrapper2 should not exist after gc"


def test_homo():
jalencato marked this conversation as resolved.
Show resolved Hide resolved
# single node type and edge type input
jalencato marked this conversation as resolved.
Show resolved Hide resolved
conf = {
"version": "gconstruct-v0.1", "nodes": [
{"node_id_col": "id", "node_type": "movie", "format": {"name": "parquet"},
"files": "/data/ml-100k/movie.parquet", "features": [
{"feature_col": "title", "transform": {
"name": "bert_hf", "bert_model": "bert-base-uncased", "max_seq_length": 16}}],
"labels": [{"label_col": "label", "task_type": "classification", "split_pct": [0.8, 0.1, 0.1]}]}],
"edges": [
{"source_id_col": "src_id", "dest_id_col": "dst_id", "relation": ["movie", "rating", "movie"],
"format": {"name": "parquet"}, "files": "/data/ml-100k/edges_homo.parquet", "labels": [
{"label_col": "rate", "task_type": "classification", "split_pct": [0.1, 0.1, 0.1]}]}]
}
verify_confs(conf, rev_edges=False)
assert conf['nodes'][0]["node_type"] == "_N"
assert conf['edges'][0]['relation'] == ["_N", "_E", "_N"]

conf['nodes'][0]["node_type"] = "movie"
conf['edges'][0]['relation'] = ['movie', 'rating', 'movie']
verify_confs(conf, rev_edges=True)
assert conf['nodes'][0]["node_type"] == "movie"
assert conf['edges'][0]['relation'] == ["movie", "rating", "movie"]

# multiple node types and edge types input
conf = {
"version": "gconstruct-v0.1", "nodes": [
{"node_id_col": "id", "node_type": "movie", "format": {"name": "parquet"},
"files": "/data/ml-100k/movie.parquet", "features": [
{"feature_col": "title", "transform": {
"name": "bert_hf", "bert_model": "bert-base-uncased", "max_seq_length": 16}}],
"labels": [{"label_col": "label", "task_type": "classification", "split_pct": [0.8, 0.1, 0.1]}]},
{"node_type": "movie", "format": {"name": "parquet"}, "files": "/data/ml-100k/movie.parquet",
"features": [{"feature_col": "id"}]}],
"edges": [
{"source_id_col": "src_id", "dest_id_col": "dst_id", "relation": ["movie", "rating", "movie"],
"format": {"name": "parquet"}, "files": "/data/ml-100k/edges_homo.parquet", "labels": [
{"label_col": "rate", "task_type": "classification", "split_pct": [0.1, 0.1, 0.1]}]},
{"relation": ["movie", "rating", "movie"], "format": {"name": "parquet"},
"files": "/data/ml-100k/edges_homo.parquet"}]
}
verify_confs(conf, rev_edges=False)
assert conf['nodes'][0]["node_type"] == "_N"
assert conf['edges'][0]['relation'] == ["_N", "_E", "_N"]

conf['nodes'][0]["node_type"] = "movie"
conf['edges'][0]['relation'] = ['movie', 'rating', 'movie']
verify_confs(conf, rev_edges=True)
assert conf['nodes'][0]["node_type"] == "movie"
assert conf['edges'][0]['relation'] == ["movie", "rating", "movie"]

if __name__ == '__main__':
test_parse_edge_data()
test_multiprocessing_checks()
Expand All @@ -1723,4 +1775,5 @@ def test_gc():
test_label()
test_multicolumn(None)
test_multicolumn("/")
test_feature_wrapper()
test_feature_wrapper()
test_homo()
Loading