Skip to content

Commit

Permalink
Update GSgnnData for examples (#805)
Browse files Browse the repository at this point in the history
*Issue #, if available:*
#755 #756 

*Description of changes:*
Update examples:
 - [x] HGT
 - [x] GPeft
 - [x] TGAT

By submitting this pull request, I confirm that you can use, modify,
copy, and redistribute this contribution, under the terms of your
choice.

---------

Co-authored-by: Xiang Song <[email protected]>
  • Loading branch information
classicsong and Xiang Song authored Apr 28, 2024
1 parent 9a3242f commit 6ccfa64
Show file tree
Hide file tree
Showing 19 changed files with 106 additions and 88 deletions.
5 changes: 1 addition & 4 deletions docs/source/api/graphstorm.dataloading.rst
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,7 @@ DataSets
:nosignatures:
:template: datasettemplate.rst

GSgnnNodeTrainData
GSgnnNodeInferData
GSgnnEdgeTrainData
GSgnnEdgeInferData
GSgnnData

DataLoaders
------------
Expand Down
2 changes: 1 addition & 1 deletion examples/customized_models/HGT/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ In order to plus users' own GNN models into the GraphStorm Framework, users need
- Define your own loss function, or use GraphStorm's built-in loss functions that can handle common classification, regression, and link prediction tasks.
- In case of an unused-weights problem, modify the loss computation to include a regularization term over all parameters

3. Use the GraphStorm's dataset, e.g., [GSgnnNodeTrainData](https://github.com/awslabs/graphstorm/blob/main/python/graphstorm/dataloading/dataset.py#L469) and dataloader, e.g., [GSgnnNodeDataLoader](https://github.com/awslabs/graphstorm/blob/main/python/graphstorm/dataloading/dataloading.py#L544) to construct distributed graph loading and mini-batch sampling.
3. Use the GraphStorm's dataset, e.g., [GSgnnData](https://github.com/awslabs/graphstorm/blob/main/python/graphstorm/dataloading/dataset.py#L157) and dataloader, e.g., [GSgnnNodeDataLoader](https://github.com/awslabs/graphstorm/blob/main/python/graphstorm/dataloading/dataloading.py#L544) to construct distributed graph loading and mini-batch sampling.

4. Wrap your model in a GraphStorm trainer, e.g., [GSgnnNodePredictionTrainer](https://github.com/awslabs/graphstorm/blob/main/python/graphstorm/trainer/np_trainer.py), which will handle the training process with its fit() method.

Expand Down
41 changes: 24 additions & 17 deletions examples/customized_models/HGT/hgt_nc.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from graphstorm import model as gsmodel
from graphstorm.trainer import GSgnnNodePredictionTrainer
from graphstorm.inference import GSgnnNodePredictionInferrer
from graphstorm.dataloading import GSgnnNodeTrainData, GSgnnNodeInferData
from graphstorm.dataloading import GSgnnData
from graphstorm.dataloading import GSgnnNodeDataLoader
from graphstorm.eval import GSgnnAccEvaluator
from graphstorm.tracker import GSSageMakerTaskTracker
Expand Down Expand Up @@ -272,11 +272,7 @@ def main(args):
node_feat_fields[node_type] = feat_names.split(',')

# Define the GraphStorm training dataset
train_data = GSgnnNodeTrainData(config.graph_name,
config.part_config,
train_ntypes=config.target_ntype,
node_feat_field=node_feat_fields,
label_field=config.label_field)
train_data = GSgnnData(config.part_config)

# Create input arguments for the HGT model
node_dict = {}
Expand Down Expand Up @@ -311,18 +307,29 @@ def main(args):
trainer = GSgnnNodePredictionTrainer(model, topk_model_to_save=config.topk_model_to_save)
trainer.setup_device(device=get_device())

train_idxs = train_data.get_node_train_set(config.target_ntype)
# Define the GraphStorm train dataloader
dataloader = GSgnnNodeDataLoader(train_data, train_data.train_idxs, fanout=config.fanout,
batch_size=config.batch_size, train_task=True)

dataloader = GSgnnNodeDataLoader(train_data, train_idxs, fanout=config.fanout,
batch_size=config.batch_size,
node_feats=node_feat_fields,
label_field=config.label_field,
train_task=True)

eval_ntype = config.eval_target_ntype
val_idxs = train_data.get_node_val_set(eval_ntype)
test_idxs = train_data.get_node_test_set(eval_ntype)
# Optional: Define the evaluation dataloader
eval_dataloader = GSgnnNodeDataLoader(train_data, train_data.val_idxs,fanout=config.fanout,
eval_dataloader = GSgnnNodeDataLoader(train_data, val_idxs, fanout=config.fanout,
batch_size=config.eval_batch_size,
node_feats=node_feat_fields,
label_field=config.label_field,
train_task=False)

# Optional: Define the evaluation dataloader
test_dataloader = GSgnnNodeDataLoader(train_data, train_data.test_idxs,fanout=config.fanout,
test_dataloader = GSgnnNodeDataLoader(train_data, test_idxs, fanout=config.fanout,
batch_size=config.eval_batch_size,
node_feats=node_feat_fields,
label_field=config.label_field,
train_task=False)

# Optional: set up a evaluator
Expand Down Expand Up @@ -351,18 +358,18 @@ def main(args):
model.restore_model(best_model_path)

# Create a dataset for inference.
infer_data = GSgnnNodeInferData(config.graph_name, config.part_config,
eval_ntypes=config.target_ntype,
node_feat_field=node_feat_fields,
label_field=config.label_field)
infer_data = GSgnnData(config.part_config)

# Create an inference for a node task.
infer = GSgnnNodePredictionInferrer(model)
infer.setup_device(device=get_device())
infer.setup_evaluator(evaluator)
infer.setup_task_tracker(tracker)
dataloader = GSgnnNodeDataLoader(infer_data, infer_data.test_idxs,
infer_idxs = infer_data.get_node_infer_set(eval_ntype)
dataloader = GSgnnNodeDataLoader(infer_data,infer_idxs,
fanout=config.fanout, batch_size=100,
node_feats=node_feat_fields,
label_field=config.label_field,
train_task=False)

# Run inference on the inference dataset and save the GNN embeddings in the specified path.
Expand Down Expand Up @@ -390,7 +397,7 @@ def main(args):
default=argparse.SUPPRESS,
help="Print more information. \
For customized models, MUST have this argument!!")
argparser.add_argument("--local_rank", type=int,
argparser.add_argument("--local-rank", type=int,
help="The rank of the trainer. \
For customized models, MUST have this argument!!")

Expand Down
22 changes: 13 additions & 9 deletions examples/peft_llm_gnn/main_lp.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from graphstorm.config import GSConfig
from graphstorm.dataloading import GSgnnLinkPredictionDataLoader, GSgnnLinkPredictionTestDataLoader
from graphstorm.eval import GSgnnMrrLPEvaluator
from graphstorm.dataloading import GSgnnLPTrainData
from graphstorm.dataloading import GSgnnData
from graphstorm.utils import get_device
from graphstorm.inference import GSgnnLinkPredictionInferrer
from graphstorm.trainer import GSgnnLinkPredictionTrainer
Expand All @@ -20,14 +20,12 @@ def main(config_args):
gs.initialize(ip_config=config.ip_config, backend=config.backend,
local_rank=config.local_rank)
# Define the training dataset
train_data = GSgnnLPTrainData(
config.graph_name,
train_data = GSgnnData(
config.part_config,
train_etypes=config.train_etype,
eval_etypes=config.eval_etype,
label_field=None,
node_feat_field=config.node_feat_name,
)
train_etypes=config.train_etype
eval_etypes=config.eval_etype

model = GNNLLM_LP(
g=train_data.g,
Expand Down Expand Up @@ -69,23 +67,27 @@ def main(config_args):
trainer.setup_task_tracker(tracker)

# create train loader with uniform negative sampling
train_idxs = train_data.get_edge_train_set(train_etypes)
dataloader = GSgnnLinkPredictionDataLoader(
train_data,
train_data.train_idxs,
train_idxs,
fanout=config.fanout,
batch_size=config.batch_size,
num_negative_edges=config.num_negative_edges,
node_feats=config.node_feat_name,
train_task=True,
reverse_edge_types_map=config.reverse_edge_types_map,
exclude_training_targets=config.exclude_training_targets,
)

# create val loader
val_idxs = train_data.get_edge_val_set(eval_etypes)
val_dataloader = GSgnnLinkPredictionTestDataLoader(
train_data,
train_data.val_idxs,
val_idxs,
batch_size=config.eval_batch_size,
num_negative_edges=config.num_negative_edges,
node_feats=config.node_feat_name,
fanout=config.fanout,
)

Expand All @@ -112,11 +114,13 @@ def main(config_args):
infer.setup_evaluator(evaluator)
infer.setup_task_tracker(tracker)
# Create test loader
infer_idxs = train_data.get_edge_infer_set(eval_etypes)
test_dataloader = GSgnnLinkPredictionTestDataLoader(
train_data,
train_data.test_idxs,
infer_idxs,
batch_size=config.eval_batch_size,
num_negative_edges=config.num_negative_edges_eval,
node_feats=config.node_feat_name,
fanout=config.fanout,
)
# Run inference on the inference dataset and save the GNN embeddings in the specified path.
Expand Down
27 changes: 15 additions & 12 deletions examples/peft_llm_gnn/main_nc.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from graphstorm.config import GSConfig
from graphstorm.dataloading import GSgnnNodeDataLoader
from graphstorm.eval import GSgnnAccEvaluator
from graphstorm.dataloading import GSgnnNodeTrainData
from graphstorm.dataloading import GSgnnData
from graphstorm.utils import get_device
from graphstorm.inference import GSgnnNodePredictionInferrer
from graphstorm.trainer import GSgnnNodePredictionTrainer
Expand All @@ -18,14 +18,8 @@ def main(config_args):
gs.initialize(ip_config=config.ip_config, backend=config.backend,
local_rank=config.local_rank)
# Define the training dataset
train_data = GSgnnNodeTrainData(
config.graph_name,
config.part_config,
train_ntypes=config.target_ntype,
eval_ntypes=config.eval_target_ntype,
label_field=config.label_field,
node_feat_field=config.node_feat_name,
)
train_data = GSgnnData(
config.part_config)

model = GNNLLM_NC(
g=train_data.g,
Expand Down Expand Up @@ -66,20 +60,26 @@ def main(config_args):
trainer.setup_task_tracker(tracker)

# create train loader
train_idxs = train_data.get_node_train_set(config.target_ntype)
dataloader = GSgnnNodeDataLoader(
train_data,
train_data.train_idxs,
train_idxs,
fanout=config.fanout,
batch_size=config.batch_size,
node_feats=config.node_feat_name,
label_field=config.label_field,
train_task=True,
)

# create val loader
val_idxs = train_data.get_node_val_set(config.eval_target_ntype)
val_dataloader = GSgnnNodeDataLoader(
train_data,
train_data.val_idxs,
val_idxs,
fanout=config.fanout,
batch_size=config.eval_batch_size,
node_feats=config.node_feat_name,
label_field=config.label_field,
train_task=False,
)

Expand All @@ -106,11 +106,14 @@ def main(config_args):
infer.setup_evaluator(evaluator)
infer.setup_task_tracker(tracker)
# Create test loader
test_idxs = train_data.get_node_test_set(config.eval_target_ntype)
test_dataloader = GSgnnNodeDataLoader(
train_data,
train_data.test_idxs,
test_idxs,
fanout=config.fanout,
batch_size=config.eval_batch_size,
node_feats=config.node_feat_name,
label_field=config.label_field,
train_task=False,
)
# Run inference on the inference dataset and save the GNN embeddings in the specified path.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,6 @@ gsf:
use_self_loop: true
udf:
save_result_path: tgat_nc_gpu
eval_target_ntype:
- paper
eval_target_ntypes:
- paper
version: 1.0
29 changes: 16 additions & 13 deletions examples/temporal_graph_learning/main_nc.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from graphstorm.config import GSConfig
from graphstorm.dataloading import GSgnnNodeDataLoader
from graphstorm.eval import GSgnnAccEvaluator
from graphstorm.dataloading import GSgnnNodeTrainData
from graphstorm.dataloading import GSgnnData
from graphstorm.utils import get_device
from graphstorm.trainer import GSgnnNodePredictionTrainer

Expand All @@ -18,22 +18,16 @@ def main(config_args):
local_rank=config.local_rank)

# Define the training dataset
train_data = GSgnnNodeTrainData(
config.graph_name,
config.part_config,
train_ntypes=config.target_ntype,
eval_ntypes=config.eval_target_ntype,
label_field=config.label_field,
node_feat_field=config.node_feat_name,
)
train_data = GSgnnData(
config.part_config)

# Define TGAT model
model = create_rgcn_model_for_nc(train_data.g, config)
print(model)

# Create a trainer for NC tasks.
trainer = GSgnnNodePredictionTrainer(
model, gs.get_rank(), topk_model_to_save=config.topk_model_to_save
model, topk_model_to_save=config.topk_model_to_save
)

if config.restore_model_path is not None:
Expand All @@ -57,29 +51,38 @@ def main(config_args):
trainer.setup_evaluator(evaluator)

# create train loader
train_idxs = train_data.get_node_train_set(config.target_ntype)
dataloader = GSgnnNodeDataLoader(
train_data,
train_data.train_idxs,
train_idxs,
fanout=config.fanout,
batch_size=config.batch_size,
node_feats=config.node_feat_name,
label_field=config.label_field,
train_task=True,
)

# create val loader
val_idxs = train_data.get_node_val_set(config.eval_target_ntypes)
val_dataloader = GSgnnNodeDataLoader(
train_data,
train_data.val_idxs,
val_idxs,
fanout=config.fanout,
batch_size=config.eval_batch_size,
node_feats=config.node_feat_name,
label_field=config.label_field,
train_task=False,
)

# create test loader
test_idxs = train_data.get_node_test_set(config.eval_target_ntypes)
test_dataloader = GSgnnNodeDataLoader(
train_data,
train_data.test_idxs,
test_idxs,
fanout=config.fanout,
batch_size=config.eval_batch_size,
node_feats=config.node_feat_name,
label_field=config.label_field,
train_task=False,
)

Expand Down
4 changes: 2 additions & 2 deletions python/graphstorm/model/embed.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,9 +205,9 @@ class GSNodeEncoderInputLayer(GSNodeInputLayer):
from graphstorm import get_node_feat_size
from graphstorm.model import GSgnnNodeModel, GSNodeEncoderInputLayer
from graphstorm.dataloading import GSgnnNodeTrainData
from graphstorm.dataloading import GSgnnData
np_data = GSgnnNodeTrainData(...)
np_data = GSgnnData(...)
model = GSgnnEdgeModel(alpha_l2norm=0)
feat_size = get_node_feat_size(np_data.g, 'feat')
Expand Down
4 changes: 2 additions & 2 deletions python/graphstorm/model/gat_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,10 +137,10 @@ class GATEncoder(GraphConvEncoder):
from graphstorm.model.gat_encoder import GATEncoder
from graphstorm.model.node_decoder import EntityClassifier
from graphstorm.model import GSgnnNodeModel, GSNodeEncoderInputLayer
from graphstorm.dataloading import GSgnnNodeTrainData
from graphstorm.dataloading import GSgnnData
from graphstorm.model import do_full_graph_inference
np_data = GSgnnNodeTrainData(...)
np_data = GSgnnData(...)
model = GSgnnNodeModel(alpha_l2norm=0)
feat_size = get_node_feat_size(np_data.g, 'feat')
Expand Down
4 changes: 2 additions & 2 deletions python/graphstorm/model/gatv2_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,10 +141,10 @@ class GATv2Encoder(GraphConvEncoder):
from graphstorm.model.gat_encoder import GATv2Encoder
from graphstorm.model.node_decoder import EntityClassifier
from graphstorm.model import GSgnnNodeModel, GSNodeEncoderInputLayer
from graphstorm.dataloading import GSgnnNodeTrainData
from graphstorm.dataloading import GSgnnData
from graphstorm.model import do_full_graph_inference
np_data = GSgnnNodeTrainData(...)
np_data = GSgnnData(...)
model = GSgnnNodeModel(alpha_l2norm=0)
feat_size = get_node_feat_size(np_data.g, 'feat')
Expand Down
4 changes: 2 additions & 2 deletions python/graphstorm/model/hgt_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,10 +314,10 @@ class HGTEncoder(GraphConvEncoder):
from graphstorm.model.hgt_encoder import HGTEncoder
from graphstorm.model.edge_decoder import MLPEdgeDecoder
from graphstorm.model import GSgnnEdgeModel, GSNodeEncoderInputLayer
from graphstorm.dataloading import GSgnnNodeTrainData
from graphstorm.dataloading import GSgnnData
from graphstorm.model import do_full_graph_inference
np_data = GSgnnNodeTrainData(...)
np_data = GSgnnData(...)
model = GSgnnEdgeModel(alpha_l2norm=0)
feat_size = get_node_feat_size(np_data.g, 'feat')
Expand Down
Loading

0 comments on commit 6ccfa64

Please sign in to comment.