Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Homo Optimization] Add homogeneous graph optimization #683

Merged
merged 22 commits into from
Dec 15, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 10 additions & 13 deletions python/graphstorm/gconstruct/construct_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -607,18 +607,6 @@ def verify_confs(confs):
"The config file does not have a 'version' entry. Assuming gconstruct-v0.1")
ntypes = {conf['node_type'] for conf in confs["nodes"]}
etypes = [conf['relation'] for conf in confs["edges"]]
# Adjust input to DGL homogeneous graph format if it is a homogeneous graph
if is_homogeneous(confs):
assert etypes[0][0] in ntypes, \
f"source node type {etypes[0][0]} does not exist. Please check your input data."
assert etypes[0][2] in ntypes, \
f"dest node type {etypes[0][2]} does not exist. Please check your input data."
logging.warning("Generated Graph is a homogeneous graph, so the node type will be "
"changed to _N and edge type will be changed to [_N, _E, _N]")
for node in confs['nodes']:
node['node_type'] = DEFAULT_NTYPE
for edge in confs['edges']:
edge['relation'] = list(DEFAULT_ETYPE)
for etype in etypes:
assert len(etype) == 3, \
"The edge type must be (source node type, relation type, dest node type)."
Expand All @@ -627,6 +615,14 @@ def verify_confs(confs):
f"source node type {src_type} does not exist. Please check your input data."
assert dst_type in ntypes, \
f"dest node type {dst_type} does not exist. Please check your input data."
# Adjust input to DGL homogeneous graph format if it is a homogeneous graph
if is_homogeneous(confs):
logging.warning("Generated Graph is a homogeneous graph, so the node type will be "
"changed to _N and edge type will be changed to [_N, _E, _N]")
for node in confs['nodes']:
node['node_type'] = DEFAULT_NTYPE
for edge in confs['edges']:
edge['relation'] = list(DEFAULT_ETYPE)

def print_graph_info(g, node_data, edge_data, node_label_stats, edge_label_stats):
""" Print graph information.
Expand Down Expand Up @@ -745,7 +741,8 @@ def process_graph(args):
if key not in ["train_mask", "test_mask", "val_mask"]:
data[key] = np.concatenate([value, value])
else:
data[key] = np.concatenate([value, [0]*len(value)])
data[key] = np.concatenate([value, np.zeros(value.shape,
dtype=value.dtype)])

else:
for etype in edges:
Expand Down
4 changes: 2 additions & 2 deletions tests/end2end-tests/data_process/homogeneous_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,10 @@ error_and_exit $?
python3 $GS_HOME/tests/end2end-tests/data_process/check_homogeneous.py
error_and_exit $?

echo "********* Test Node Classification on GConstruct Homogeneous Graph on reverse edge********"
echo "********* Test Node Classification on GConstruct Homogeneous Graph with reverse edge********"
python3 -m graphstorm.run.gs_node_classification --workspace $GS_HOME/training_scripts/gsgnn_np/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /tmp/movielen_100k_train_val_1p_4t_homogeneous_rev/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_nc.yaml --target-ntype _N
error_and_exit $?

echo "********* Test Edge Classification on GConstruct Homogeneous Graph on reverse edge ********"
echo "********* Test Edge Classification on GConstruct Homogeneous Graph with reverse edge ********"
python3 -m graphstorm.run.gs_edge_classification --workspace $GS_HOME/training_scripts/gsgnn_ep/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /tmp/movielen_100k_train_val_1p_4t_homogeneous_rev/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_ec.yaml --target-etype _N,_E,_N
error_and_exit $?
3 changes: 1 addition & 2 deletions tests/end2end-tests/data_process/movielens_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@ service ssh restart
GS_HOME=$(pwd)
NUM_TRAINERS=4
export PYTHONPATH=$GS_HOME/python/
cd $GS_HOME/training_scripts/gsgnn_np
echo "127.0.0.1" > ip_list.txt
cd $GS_HOME/training_scripts/gsgnn_ep

echo "127.0.0.1" > ip_list.txt

error_and_exit () {
Expand Down
17 changes: 16 additions & 1 deletion tests/unit-tests/gconstruct/test_construct_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
See the License for the specific language governing permissions and
limitations under the License.
"""
import copy
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

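`copy` is now imported twice in this file (here and further down, after `import torch as th`); please remove one of the duplicate imports.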

import random
import os
import tempfile
Expand All @@ -22,11 +23,12 @@
import numpy as np
import dgl
import torch as th
import copy

from functools import partial
from numpy.testing import assert_equal, assert_almost_equal

from graphstorm.gconstruct.construct_graph import parse_edge_data, verify_confs
from graphstorm.gconstruct.construct_graph import parse_edge_data, verify_confs, is_homogeneous
from graphstorm.gconstruct.file_io import write_data_parquet, read_data_parquet
from graphstorm.gconstruct.file_io import write_data_json, read_data_json
from graphstorm.gconstruct.file_io import write_data_csv, read_data_csv
Expand Down Expand Up @@ -1720,9 +1722,16 @@ def test_homogeneous():
"format": {"name": "parquet"}, "files": "/data/ml-100k/edges_homo.parquet", "labels": [
{"label_col": "rate", "task_type": "classification", "split_pct": [0.1, 0.1, 0.1]}]}]
}
assert is_homogeneous(conf)
verify_confs(conf)
assert conf['nodes'][0]["node_type"] == "_N"
assert conf['edges'][0]['relation'] == ["_N", "_E", "_N"]
conf["edges"][0]["relation"] = ["movie_fake", "rating", "movie"]
conf["nodes"].append(copy.deepcopy(conf["nodes"][0]))
conf["nodes"][0]["node_type"] = "movie"
conf["nodes"][1]["node_type"] = "movie_fake"
assert not is_homogeneous(conf)


# multiple node types and edge types input
conf = {
Expand All @@ -1741,9 +1750,15 @@ def test_homogeneous():
{"relation": ["movie", "rating", "movie"], "format": {"name": "parquet"},
"files": "/data/ml-100k/edges_homo.parquet"}]
}
assert is_homogeneous(conf)
verify_confs(conf)
assert conf['nodes'][0]["node_type"] == "_N"
assert conf['edges'][0]['relation'] == ["_N", "_E", "_N"]
conf["edges"][0]["relation"] = ["movie_fake", "rating", "movie"]
conf["nodes"].append(copy.deepcopy(conf["nodes"][0]))
conf["nodes"][0]["node_type"] = "movie"
conf["nodes"][1]["node_type"] = "movie_fake"
assert not is_homogeneous(conf)

if __name__ == '__main__':
test_parse_edge_data()
Expand Down
Loading