From f4d578560e6eee6b991b133de6a9f97005dab232 Mon Sep 17 00:00:00 2001 From: "Jian Zhang (James)" <6593865@qq.com> Date: Mon, 29 Jul 2024 17:37:31 -0700 Subject: [PATCH] [Doc] API doc string refactor for graphstorm.dataloading (#934) *Issue #, if available:* *Description of changes:* This PR refactors the API doc string and the API reference rst pages for graphstorm.dataloading module. The rendered readthedoc pages are: 1. API reference index page: https://james4graphstorm.readthedocs.io/en/james_apidoc_dataloading/api/references/index.html 2. graphstorm.dataset and graphstorm.dataloading index page: https://james4graphstorm.readthedocs.io/en/james_apidoc_dataloading/api/references/graphstorm.dataloading.html All rst pages of classes under graphstorm.dataloading module are updated. Back compatibility breaking changes: 1. Rename the `pos_graph_feat_fields` with `pos_graph_edge_feat_fields` to make its meaning clearer. By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice. --------- Co-authored-by: Ubuntu Co-authored-by: Oxfordblue7 <457620544@qq.com> Co-authored-by: xiang song(charlie.song) --- docs/source/_templates/dataloadertemplate.rst | 4 +- docs/source/_templates/datasettemplate.rst | 3 +- .../api/references/graphstorm.dataloading.rst | 46 +- docs/source/graph-construction/index.rst | 2 + python/graphstorm/dataloading/dataloading.py | 736 ++++++++++-------- python/graphstorm/dataloading/dataset.py | 148 ++-- python/graphstorm/trainer/lp_trainer.py | 4 +- python/graphstorm/trainer/mt_trainer.py | 4 +- 8 files changed, 515 insertions(+), 432 deletions(-) diff --git a/docs/source/_templates/dataloadertemplate.rst b/docs/source/_templates/dataloadertemplate.rst index f02d586215..515139624d 100644 --- a/docs/source/_templates/dataloadertemplate.rst +++ b/docs/source/_templates/dataloadertemplate.rst @@ -7,4 +7,6 @@ .. 
autoclass:: {{ name }} :show-inheritance: - :special-members: __iter__, __next__ \ No newline at end of file + :members: + :member-order: bysource + :special-members: __iter__, __next__, __len__ diff --git a/docs/source/_templates/datasettemplate.rst b/docs/source/_templates/datasettemplate.rst index 1225aa3c46..16e04ade54 100644 --- a/docs/source/_templates/datasettemplate.rst +++ b/docs/source/_templates/datasettemplate.rst @@ -7,4 +7,5 @@ .. autoclass:: {{ name }} :show-inheritance: - :members: prepare_data, get_node_feats, get_edge_feats, get_labels, get_node_feat_size + :members: + :member-order: bysource diff --git a/docs/source/api/references/graphstorm.dataloading.rst b/docs/source/api/references/graphstorm.dataloading.rst index 2ce324888c..db95907702 100644 --- a/docs/source/api/references/graphstorm.dataloading.rst +++ b/docs/source/api/references/graphstorm.dataloading.rst @@ -1,17 +1,34 @@ .. _apidataloading: -graphstorm.dataloading -========================== +graphstorm.dataloading.dataset +=============================== - GraphStorm dataloading module includes a set of graph DataSets and DataLoaders for different - graph machine learning tasks. - - If users would like to customize DataLoaders, please extend those classes in the - :ref:`Base DataLoaders ` section and customize their abstract methods. + GraphStorm dataset provides one unified dataset class, i.e., ``GSgnnData``, for all graph + machine learning tasks. Users can build a ``GSgnnData`` object by giving the path of + the JSON file created by the :ref:`GraphStorm Graph Construction` + operations. The ``GSgnnData`` will load the related graph artifacts specified in the JSON + file. It provides a set of APIs for users to extract information of the graph data for + model training and inference. .. currentmodule:: graphstorm.dataloading -.. _basedataloaders: +.. 
autosummary:: + :toctree: ../generated/ + :nosignatures: + :template: datasettemplate.rst + + GSgnnData + +graphstorm.dataloading.dataloading +=================================== + + GraphStorm dataloading module includes a set of different DataLoaders for + different graph machine learning tasks. + + If users would like to customize DataLoaders, please extend those dataloader base + classes in the **Base DataLoaders** section and customize their abstract functions. + +.. currentmodule:: graphstorm.dataloading Base DataLoaders ------------------- @@ -25,16 +42,6 @@ Base DataLoaders GSgnnEdgeDataLoaderBase GSgnnLinkPredictionDataLoaderBase -DataSets ------------- - -.. autosummary:: - :toctree: ../generated/ - :nosignatures: - :template: datasettemplate.rst - - GSgnnData - DataLoaders ------------ @@ -44,5 +51,8 @@ DataLoaders :template: dataloadertemplate.rst GSgnnNodeDataLoader + GSgnnNodeSemiSupDataLoader GSgnnEdgeDataLoader GSgnnLinkPredictionDataLoader + GSgnnLinkPredictionTestDataLoader + GSgnnLinkPredictionPredefinedTestDataLoader diff --git a/docs/source/graph-construction/index.rst b/docs/source/graph-construction/index.rst index b917c814f6..4c5d8403f5 100644 --- a/docs/source/graph-construction/index.rst +++ b/docs/source/graph-construction/index.rst @@ -1,3 +1,5 @@ +.. _graph_construction: + ================== Graph Construction ================== diff --git a/python/graphstorm/dataloading/dataloading.py b/python/graphstorm/dataloading/dataloading.py index 1020529857..d3c9728770 100644 --- a/python/graphstorm/dataloading/dataloading.py +++ b/python/graphstorm/dataloading/dataloading.py @@ -117,7 +117,7 @@ class MultiLayerNeighborSamplerForReconstruct(dgl.dataloading.BlockSampler): construct_feat_ntype : list of str The node types that requires to construct node features. construct_feat_fanout : int - The fanout required to construct node features. + The fanout used when constructing node features for feature-less nodes. 
""" def __init__(self, sampler, dataset, construct_feat_ntype, construct_feat_fanout): super().__init__() @@ -151,38 +151,45 @@ def sample_blocks(self, g, seed_nodes, exclude_eids=None): class GSgnnEdgeDataLoaderBase(): """ The base dataloader class for edge tasks. - If users want to customize the dataloader for edge prediction tasks + If users want to customize dataloaders for edge prediction tasks, they should extend this base class by implementing the special methods - `__iter__` and `__next__`. + ``__iter__``, ``__next__``, and ``__len__``. Parameters ---------- dataset : GSgnnData - The dataset for the edge task. + The GraphStorm data for edge tasks. target_idx : dict of Tensors - The target edge IDs. + The target edge indexes for prediction. fanout : list or dict of lists - The fanout for each GNN layer. + The fanout for each GNN layer. If it's a dict of lists, it indicates the fanout for each + edge type. label_field: str or dict of str Label field of the edge task. - node_feats: str, or dist of list of str - Node features. - str: All the nodes have the same feature name. - list of string: All the nodes have the same list of features. - dist of list of string: Each node type have different set of node features. - Default: None - edge_feats: str, or dist of list of str - Edge features. - str: All the edges have the same feature name. - list of string: All the edges have the same list of features. - dist of list of string: Each edge type have different set of edge features. - Default: None - decoder_edge_feats: str or dict of list of str - Edge features used in decoder. - str: All the edges have the same feature name. - list of string: All the edges have the same list of features. - dist of list of string: Each edge type have different set of edge features. - Default: None + node_feats: str, or dict of list of str + Node feature fileds in three possible formats: + + - string: All nodes have the same feature name. 
+ - list of string: All nodes have the same list of features. + - dict of list of string: Each node type have different set of node features. + + Default: None. + edge_feats: str, or dict of list of str + Edge feature fields in three possible formats: + + - string: All edges have the same feature name. + - list of string: All edges have the same list of features. + - dict of list of string: Each edge type have different set of edge features. + + Default: None. + decoder_edge_feats: str, or dict of list of str + Edge feature fields used in edge decoders in three possible formats: + + - string: All edges have the same feature name. + - list of string: All edges have the same list of features. + - dict of list of string: Each edge type have different set of edge features. + + Default: None. """ def __init__(self, dataset, target_idx, fanout, label_field, node_feats=None, edge_feats=None, decoder_edge_feats=None): @@ -199,154 +206,168 @@ def __init__(self, dataset, target_idx, fanout, self._decoder_edge_feats = decoder_edge_feats def __iter__(self): - """ Returns an iterator object + """ Returns an iterator object. """ def __next__(self): """ Return a mini-batch data for the edge task. - A mini-batch comprises three objects: the input node IDs, - the target edges and the subgraph blocks for message passing. + A mini-batch comprises three objects: 1) the input node IDs, + 2) the target edges, and 3) the subgraph blocks for message passing. Returns ------- - dict of Tensors : the input node IDs of the mini-batch. - DGLGraph : the target edges. - list of DGLGraph : the subgraph blocks for message passing. + + - dict of Tensors : the input node IDs of the mini-batch. + - DGLGraph : the target edges. + - list of DGLGraph : the subgraph blocks for message passing. + """ def __len__(self): - """ Return the length (number of mini-batches) of the data loader + """ Return the length (number of mini-batches) of the data loader.
Returns + ------- int: length """ @property def data(self): - """ The dataset of this dataloader. + """ The dataset of this dataloader, which is given in class initialization. Returns ------- - GSgnnData : The dataset of the dataloader. + GSgnnData: The dataset of the dataloader. """ return self._data @property def target_eidx(self): - """ Target edge idx for prediction + """ Target edge indexes for prediction, which is given in class initialization. Returns ------- - dict of Tensors : the target edge IDs. + dict of Tensors: the target edge IDs, which is given in class initialization. """ return self._target_eidx @property def fanout(self): - """ The fan out of each GNN layers + """ The fanout of each GNN layer, which is given in class initialization. Returns ------- - list or a dict of list : the fanouts for each GNN layer. + list or a dict of list: the fanouts for each GNN layer, which is given in class + initialization. """ return self._fanout @property def label_field(self): - """ The label field + """ The label field, which is given in class initialization. Returns ------- - str: Label fields in the graph. + str: Label fields in the graph, which is given in class initialization. """ return self._label_field @property def node_feat_fields(self): - """ Node features + """ Node feature fields, which is given in class initialization. Returns ------- - str or dict of list of str: Node feature fields in the graph. + str or dict of list of str: Node feature fields in the graph, which is given in class + initialization. """ return self._node_feats @property def edge_feat_fields(self): - """ Edge features + """ Edge feature fields, which is given in class initialization. Returns ------- - str or dict of list of str: Node feature fields in the graph. + str or dict of list of str: Edge feature fields in the graph, which is given in class + initialization. """ return self._edge_feats @property def decoder_edge_feat_fields(self): - """ Edge features for edge decoder.
+ """ Edge features for edge decoder, which is given in class initialization. Returns ------- - str or dict of list of str: Node feature fields in the graph. + str or dict of list of str: Edge feature fields in the graph, which is given in class + initialization. """ return self._decoder_edge_feats class GSgnnEdgeDataLoader(GSgnnEdgeDataLoaderBase): - """ The minibatch dataloader for edge prediction + """ The mini-batch dataloader for edge prediction tasks. - GSgnnEdgeDataLoader samples GraphStorm edge dataset into an iterable over mini-batches - of samples. Both source and destination nodes are included in the batch_graph, which + ``GSgnnEdgeDataLoader`` samples target edges into an iterable over mini-batches + of samples. Both source and destination nodes are included in the ``batch_graph``, which will be used by GraphStorm Trainers and Inferrers. Parameters ------------ dataset: GSgnnData - The GraphStorm edge dataset + The GraphStorm data. target_idx : dict of Tensors - The target edges for prediction - fanout: list of int or dict of list - Neighbor sample fanout. If it's a dict, it indicates the fanout for each edge type. + The target edge indexes for prediction. + fanout: list of int, or dict of list + Neighbor sampling fanout. If it's a dict of list, it indicates the fanout for each + edge type. batch_size: int - Batch size + Mini-batch size. label_field: str or dict of str Label field of the edge task. - node_feats: str, or dist of list of str - Node features. - str: All the nodes have the same feature name. - list of string: All the nodes have the same list of features. - dist of list of string: Each node type have different set of node features. - Default: None - edge_feats: str, or dist of list of str - Edge features. - str: All the edges have the same feature name. - list of string: All the edges have the same list of features. - dist of list of string: Each edge type have different set of edge features.
- Default: None - decoder_edge_feats: str or dict of list of str - Edge features used in decoder. - str: All the edges have the same feature name. - list of string: All the edges have the same list of features. - dist of list of string: Each edge type have different set of edge features. - Default: None + node_feats: str, or dict of list of str + Node feature fields in three possible formats: + + - string: All nodes have the same feature name. + - list of string: All nodes have the same list of features. + - dict of list of string: Each node type have different set of node features. + + Default: None. + edge_feats: str, or dict of list of str + Edge feature fields in three possible formats: + + - string: All edges have the same feature name. + - list of string: All edges have the same list of features. + - dict of list of string: Each edge type have different set of edge features. + + Default: None. + decoder_edge_feats: str, or dict of list of str + Edge features used in edge decoders in three possible formats: + + - string: All edges have the same feature name. + - list of string: All edges have the same list of features. + - dict of list of string: Each edge type have different set of edge features. + + Default: None. train_task : bool - Whether or not for training. + Whether the dataloader is for training. reverse_edge_types_map: dict - A map for reverse edge type + A map for reverse edge type. exclude_training_targets: bool - Whether to exclude training edges during neighbor sampling + Whether to exclude training edges during neighbor sampling. remove_target_edge_type: bool - Whether we will exclude all edges of the target edge type in message passing. + Whether to exclude all edges of the target edge type in message passing. construct_feat_ntype : list of str The node types that requires to construct node features. construct_feat_fanout : int - The fanout required to construct node features.
+ The fanout used when constructing node features for feature-less nodes. Examples ------------ To train a 2-layer GNN for edge prediction on a set of edges ``target_idx`` on - a graph where each nodes takes messages from 15 neighbors on the first layer - and 10 neighbors on the second. + a graph where each edge (source and destination node pair) takes messages from 15 + neighbors on the first layer and 10 neighbors on the second. .. code:: python @@ -443,29 +464,17 @@ def __next__(self): return self.dataloader.__next__() def __len__(self): - # Follow - # https://github.com/dmlc/dgl/blob/1.0.x/python/dgl/distributed/dist_dataloader.py#L116 - # In DGL, DistDataLoader.expected_idxs is the length (number of batches) - # of the datalaoder. - return self.dataloader.expected_idxs - - @property - def data(self): - """ The dataset of this dataloader. - """ - return self._data - - @property - def target_eidx(self): - """ Target edge idx for prediction """ - return self._target_eidx + Follow + https://github.com/dmlc/dgl/blob/1.0.x/python/dgl/distributed/dist_dataloader.py#L116. + In DGL, ``DistDataLoader.expected_idxs`` is the length (number of batches) + of the dataloader. - @property - def fanout(self): - """ The fan out of each GNN layers + Returns: + -------- + int: The length (number of batches) of the dataloader. """ - return self._fanout + return self.dataloader.expected_idxs ################ Minibatch DataLoader (Link Prediction) ####################### @@ -483,36 +492,41 @@ def fanout(self): BUILTIN_LP_FIXED_NEG_SAMPLER = 'fixed' class GSgnnLinkPredictionDataLoaderBase(): - """ The base class of link prediction dataloader. + """ The base dataloader class for link prediction tasks. - If users want to customize the dataloader for link prediction tasks + If users want to customize dataloaders for link prediction tasks, they should extend this base class by implementing the special methods - `__iter__` and `__next__`. + ``__iter__``, ``__next__``, and ``__len__``. 
Parameters ---------- dataset: GSgnnData - The GraphStorm edge dataset + The GraphStorm data for link prediction tasks. target_idx : dict of Tensors - The target edges for prediction - fanout: list of int or dict of list - Neighbor sample fanout. If it's a dict, it indicates the fanout for each edge type. - node_feats: str, or dist of list of str - Node features. - str: All the nodes have the same feature name. - list of string: All the nodes have the same list of features. - dist of list of string: Each node type have different set of node features. - Default: None - edge_feats: str, or dist of list of str - Edge features. - str: All the edges have the same feature name. - list of string: All the edges have the same list of features. - dist of list of string: Each edge type have different set of edge features. - Default: None - pos_graph_edge_feats: str or dist of list of str + The target edge indexes for link prediction. + fanout: list of int, or dict of list + Neighbor sampling fanout. If it's a dict of list, it indicates the fanout for each + edge type. + node_feats: str, or dict of list of str + Node feature fields in three possible formats: + + - string: All nodes have the same feature name. + - list of string: All nodes have the same list of features. + - dict of list of string: Each node type have different set of node features. + + Default: None. + edge_feats: str, or dict of list of str + Edge feature fields in three possible formats: + + - string: All edges have the same feature name. + - list of string: All edges have the same list of features. + - dict of list of string: Each edge type have different set of edge features. + + Default: None. + pos_graph_edge_feats: str, or dict of list of str The field of the edge features used by positive graph in link prediction. - For example edge weight. - Default: None + For example edge weights. + Default: None.
""" def __init__(self, dataset, target_idx, fanout, node_feats=None, edge_feats=None, pos_graph_edge_feats=None): @@ -527,36 +541,40 @@ def __init__(self, dataset, target_idx, fanout, self._pos_graph_edge_feats = pos_graph_edge_feats def __iter__(self): - """ Returns an iterator object + """ Returns an iterator object. """ def __next__(self): """ Return a mini-batch for link prediction. A mini-batch of link prediction contains four objects: - * the input node IDs of the mini-batch, - * the target positive edges for prediction, - * the negative edges for prediction, - * the subgraph blocks for message passing. + + - the input node IDs of the mini-batch. + - the target positive edges for prediction. + - the sampled negative edges for prediction. + - the subgraph blocks for message passing. Returns ------- - Tensor or dict of Tensors : the input nodes of a mini-batch. - DGLGraph : positive edges. - DGLGraph : negative edges. - list of DGLGraph : subgraph blocks for message passing. + + - Tensor or dict of Tensors: the input nodes of a mini-batch. + - DGLGraph: positive edges. + - DGLGraph: negative edges. + - list of DGLGraph: subgraph blocks for message passing. + """ def __len__(self): - """ Return the length (number of mini-batches) of the data loader + """ Return the length (number of mini-batches) of the data loader. Returns + ------- int: length """ @property def data(self): - """ The dataset of this dataloader. + """ The dataset of this dataloader, which is given in class initialization. Returns ------- @@ -566,7 +584,7 @@ def data(self): @property def fanout(self): - """ The fan out of each GNN layers + """ The fan out of each GNN layers, which is given in class initialization. Returns ------- @@ -576,7 +594,7 @@ def fanout(self): @property def target_eidx(self): - """ The target edges for prediction. + """ The target edge indexes for prediction, which is given in class initialization. 
Returns ------- @@ -586,7 +604,7 @@ def target_eidx(self): @property def node_feat_fields(self): - """ Node features + """ Node feature fields, which is given in class initialization. Returns ------- @@ -596,85 +614,92 @@ def node_feat_fields(self): @property def edge_feat_fields(self): - """ Edge features + """ Edge feature fields, which is given in class initialization. Returns ------- - str or dict of list of str: Node feature fields in the graph. + str or dict of list of str: Edge feature fields in the graph. """ return self._edge_feats @property - def pos_graph_feat_fields(self): - """ Get edge feature fields of positive graphs + def pos_graph_edge_feat_fields(self): + """ Get edge feature fields of positive graphs, which is given in class initialization. Returns ------- - str or dict of list of str: Node feature fields in the graph. + str or dict of list of str: Edge feature fields in the positive graph. """ return self._pos_graph_edge_feats class GSgnnLinkPredictionDataLoader(GSgnnLinkPredictionDataLoaderBase): - """ Link prediction minibatch dataloader + """ Mini-batch dataloader for link prediction. - GSgnnLinkPredictionDataLoader samples GraphStorm edge dataset into an iterable over mini-batches - of samples. In each batch, pos_graph and neg_graph are sampled subgraph for positive and - negative edges, which will be used by GraphStorm Trainers and Inferrers. Given a positive edge, - a negative edge is composed of the source node and a random negative destination nodes - according to a uniform distribution. + ``GSgnnLinkPredictionDataLoader`` samples GraphStorm data into an iterable over mini-batches + of samples. In each batch, ``pos_graph`` and ``neg_graph`` are sampled subgraph for positive + and negative edges, which will be used by GraphStorm Trainers and Inferrers. + + Given a positive edge, a negative edge is composed of the source node and a random negative + destination nodes according to a uniform distribution. 
Argument -------- dataset: GSgnnData - The GraphStorm edge dataset + The GraphStorm data. target_idx : dict of Tensors - The target edges for prediction - fanout: list of int or dict of list - Neighbor sample fanout. If it's a dict, it indicates the fanout for each edge type. + The target edge indexes for prediction. + fanout: list of int, or dict of list + Neighbor sampling fanout. If it's a dict of list, it indicates the fanout for each + edge type. batch_size: int - Batch size + Mini-batch size. num_negative_edges: int - The number of negative edges per positive edge - node_feats: str, or dist of list of str - Node features. - str: All the nodes have the same feature name. - list of string: All the nodes have the same list of features. - dist of list of string: Each node type have different set of node features. - Default: None - edge_feats: str, or dist of list of str - Edge features. - str: All the edges have the same feature name. - list of string: All the edges have the same list of features. - dist of list of string: Each edge type have different set of edge features. - Default: None - pos_graph_edge_feats: str or dist of list of str - The field of the edge features used by positive graph in link prediction. + The number of negative edges per positive edge. + node_feats: str, or dict of list of str + Node feature fields in three possible formats: + + - string: All nodes have the same feature name. + - list of string: All nodes have the same list of features. + - dict of list of string: Each node type have different set of node features. + + Default: None. + edge_feats: str, or dict of list of str + Edge feature fields in three possible formats: + + - string: All edges have the same feature name. + - list of string: All edges have the same list of features. + - dict of list of string: Each edge type have different set of edge features. + + Default: None.
+ pos_graph_edge_feats: str, or dict of list of str + The edge feature fields used by positive graph in link prediction. For example edge weight. - Default: None + Default: None. train_task : bool - Whether or not for training. + Whether or not it is a dataloader for training. reverse_edge_types_map: dict - A map for reverse edge type + A map for reverse edge type. exclude_training_targets: bool - Whether to exclude training edges during neighbor sampling + Whether to exclude training edges during neighbor sampling. edge_mask_for_gnn_embeddings : str - The mask that indicates the edges used for computing GNN embeddings. By default, + The mask indicates the edges used for computing GNN embeddings. By default, the dataloader uses the edges in the training graphs to compute GNN embeddings to avoid information leak for link prediction. construct_feat_ntype : list of str The node types that requires to construct node features. construct_feat_fanout : int - The fanout required to construct node features. - edge_dst_negative_field: str or dict of str - The feature field(s) that store the hard negative edges for each edge type. - num_hard_negs: int or dict of int - The number of hard negatives per positive edge for each edge type + The fanout used when constructing node features for feature-less nodes. + edge_dst_negative_field: str, or dict of str + The feature fields that store the hard negative edges for each edge type. + num_hard_negs: int, or dict of int + The number of hard negatives per positive edge for each edge type. Examples ------------ To train a 2-layer GNN for link prediction on a set of positive edges ``target_idx`` on - a graph where each nodes takes messages from 15 neighbors on the first layer - and 10 neighbors on the second. We use 10 negative edges per positive in this example. + a graph where each edge (a source and destination node pair) takes messages from 15 neighbors + on the first layer and 10 neighbors on the second. 
+ We use 10 negative edges per positive in this example. .. code:: python @@ -785,10 +810,16 @@ def __next__(self): return self.dataloader.__next__() def __len__(self): - # Follow - # https://github.com/dmlc/dgl/blob/1.0.x/python/dgl/distributed/dist_dataloader.py#L116 - # In DGL, DistDataLoader.expected_idxs is the length (number of batches) - # of the datalaoder. + """ + Follow + https://github.com/dmlc/dgl/blob/1.0.x/python/dgl/distributed/dist_dataloader.py#L116. + In DGL, ``DistDataLoader.expected_idxs`` is the length (number of batches) + of the dataloader. + + Returns: + -------- + int: The length (number of batches) of the dataloader. + """ return self.dataloader.expected_idxs class GSgnnLPJointNegDataLoader(GSgnnLinkPredictionDataLoader): @@ -929,7 +960,7 @@ def _prepare_negative_sampler(self, num_negative_edges): class AllEtypeDistEdgeDataLoader(DistDataLoader): """ Distributed edge data sampler that samples at least one - edge for each edge type in a minibatch + edge for each edge type in a mini-batch Parameters ---------- @@ -978,7 +1009,7 @@ def _reinit_dataset(self): bs_per_type = {} for etype, idxs in self.data_idx.items(): # compute the number of edges to be sampled for - # each edge type in a minibatch. + # each edge type in a mini-batch. # If batch_size * num_edges / total_edges < 0, then set 1. # # Note: The resulting batch size of a mini batch may be larger @@ -1066,7 +1097,7 @@ def _next_data(self): return new_ret class GSgnnAllEtypeLinkPredictionDataLoader(GSgnnLinkPredictionDataLoader): - """ Link prediction minibatch dataloader. In each minibatch, + """ Link prediction mini-batch dataloader. In each mini-batch, at least one edge is sampled from each etype. 
Note: using this dataloader with a graph with massive etypes @@ -1147,15 +1178,15 @@ def __next__(self): def __len__(self): # Follow - # https://github.com/dmlc/dgl/blob/1.0.x/python/dgl/distributed/dist_dataloader.py#L116 + # https://github.com/dmlc/dgl/blob/1.0.x/python/dgl/distributed/dist_dataloader.py#L116. # In DGL, DistDataLoader.expected_idxs is the length (number of batches) - # of the datalaoder. + # of the dataloader. # AllEtypeDistEdgeDataLoader is a child class of DistDataLoader. return self.dataloader.expected_idxs class GSgnnAllEtypeLPJointNegDataLoader(GSgnnAllEtypeLinkPredictionDataLoader): """ Link prediction dataloader with joint negative sampler. - In each minibatch, at least one edge is sampled from each etype. + In each mini-batch, at least one edge is sampled from each etype. """ @@ -1165,47 +1196,52 @@ def _prepare_negative_sampler(self, num_negative_edges): return negative_sampler class GSgnnLinkPredictionTestDataLoader(GSgnnLinkPredictionDataLoaderBase): - """ Link prediction minibatch dataloader for validation and test. + """ Mini-batch dataloader for link prediction validation and test. In order to efficiently compute positive and negative scores for - link prediction tasks, GSgnnLinkPredictionTestDataLoader is designed - to only generates edges, i.e., (src, dst) pairs. + link prediction tasks, ``GSgnnLinkPredictionTestDataLoader`` is designed + to only generates edges, i.e., source and destination node pairs. The negative edges are sampled uniformly. Parameters ----------- dataset: GSgnnData - The GraphStorm edge dataset + The GraphStorm data. target_idx : dict of Tensors - The target edges for prediction + The target edge indexes for link prediction. batch_size: int - Batch size + Mini-batch size. num_negative_edges: int - The number of negative edges per positive edge - fanout: int - Evaluation fanout for computing node embedding + The number of negative edges per positive edge. 
+ fanout: list of int, or dict of list + Neighbor sampling fanout. If it's a dict of list, it indicates the fanout for each + edge type. fixed_test_size: int Fixed number of test data used in evaluation. If it is none, use the whole testset. - When test is huge, using fixed_test_size + When test is huge, using ``fixed_test_size`` can save validation and test time. Default: None. - node_feats: str, or dist of list of str - Node features. - str: All the nodes have the same feature name. - list of string: All the nodes have the same list of features. - dist of list of string: Each node type have different set of node features. - Default: None - edge_feats: str, or dist of list of str - Edge features. - str: All the edges have the same feature name. - list of string: All the edges have the same list of features. - dist of list of string: Each edge type have different set of edge features. - Default: None - pos_graph_edge_feats: str or dist of list of str - The field of the edge features used by positive graph in link prediction. + node_feats: str, or dict of list of str + Node feature fields in three possible formats: + + - string: All nodes have the same feature name. + - list of string: All nodes have the same list of features. + - dict of list of string: Each node type have different set of node features. + + Default: None. + edge_feats: str, or dict of list of str + Edge feature fields in three possible formats: + + - string: All edges have the same feature name. + - list of string: All edges have the same list of features. + - dict of list of string: Each edge type have different set of edge features. + + Default: None. + pos_graph_edge_feats: str or dict of list of str + The edge feature fields used by positive graph in link prediction. For example edge weight. - Default: None + Default: None.
""" def __init__(self, dataset, target_idx, batch_size, num_negative_edges, fanout=None, fixed_test_size=None, @@ -1285,15 +1321,10 @@ def __len__(self): num_iters += math.ceil(test_size / self._batch_size) return num_iters - @property - def fanout(self): - """ Get eval fanout - """ - return self._fanout class GSgnnLinkPredictionJointTestDataLoader(GSgnnLinkPredictionTestDataLoader): - """ Link prediction minibatch dataloader for validation and test - with joint negative sampler + """ Mini-batch dataloader for Link prediction validation and test set + with joint negative sampler. """ def _prepare_negative_sampler(self, num_negative_edges): @@ -1303,43 +1334,48 @@ def _prepare_negative_sampler(self, num_negative_edges): return negative_sampler class GSgnnLinkPredictionPredefinedTestDataLoader(GSgnnLinkPredictionTestDataLoader): - """ Link prediction minibatch dataloader for validation and test - with predefined negatives. + """ Mini-batch dataloader for link prediction validation and test + with predefined negatives. Parameters ----------- dataset: GSgnnData - The GraphStorm edge dataset + The GraphStorm data. target_idx : dict of Tensors - The target edges for prediction + The target edge indexes for link prediction. batch_size: int - Batch size - fanout: int - Evaluation fanout for computing node embedding + Mini-batch size. + fanout: list of int, or dict of list + Neighbor sampling fanout. If it's a dict of list, it indicates the fanout for each + edge type. fixed_test_size: int Fixed number of test data used in evaluation. If it is none, use the whole testset. - When test is huge, using fixed_test_size + When test is huge, using `fixed_test_size` can save validation and test time. Default: None. - fixed_edge_dst_negative_field: str or list of str - The feature field(s) that store the fixed negative set for each edge. - node_feats: str, or dist of list of str - Node features. - str: All the nodes have the same feature name. 
- list of string: All the nodes have the same list of features. - dist of list of string: Each node type have different set of node features. - Default: None - edge_feats: str, or dist of list of str - Edge features. - str: All the edges have the same feature name. - list of string: All the edges have the same list of features. - dist of list of string: Each edge type have different set of edge features. - Default: None - pos_graph_edge_feats: str or dist of list of str - The field of the edge features used by positive graph in link prediction. + fixed_edge_dst_negative_field: str, or list of str + The feature fields that store the fixed negative set for each edge. + node_feats: str, or dict of list of str + Node feature fields in three possible formats: + + - string: All nodes have the same feature name. + - list of string: All nodes have the same list of features. + - dict of list of string: Each node type has a different set of node features. + + Default: None. + edge_feats: str, or dict of list of str + Edge feature fields in three possible formats: + + - string: All edges have the same feature name. + - list of string: All edges have the same list of features. + - dict of list of string: Each edge type has a different set of edge features. + + Default: None. + pos_graph_edge_feats: str, or dict of list of str + The edge feature fields used by positive graph in link prediction. For example edge weight. - Default: None + Default: None. """ def __init__(self, dataset, target_idx, batch_size, fixed_edge_dst_negative_field, fanout=None, fixed_test_size=None, @@ -1378,32 +1414,36 @@ def _next_data(self, etype): class GSgnnNodeDataLoaderBase(): """ The base dataloader class for node tasks. - If users want to customize the dataloader for node prediction tasks + If users want to customize dataloaders for their node prediction tasks, they should extend this base class by implementing the special methods - `__iter__` and `__next__`.
+ ``__iter__``, ``__next__``, and ``__len__``. Parameters ---------- dataset : GSgnnData - The dataset for the node task. + The GraphStorm data for node tasks. target_idx : dict of Tensors - The target node IDs. - fanout : list or dict of lists + The target node indexes for prediction. + fanout : list of int, or dict of lists The fanout for each GNN layer. - label_field: str or dict of str - Label field of the node task. - node_feats: str, or dist of list of str - Node features. - str: All the nodes have the same feature name. - list of string: All the nodes have the same list of features. - dist of list of string: Each node type have different set of node features. - Default: None - edge_feats: str, or dist of list of str - Edge features. - str: All the edges have the same feature name. - list of string: All the edges have the same list of features. - dist of list of string: Each edge type have different set of edge features. - Default: None + label_field: str, or dict of str + Label field name of the target node types. + node_feats: str, or dict of list of str + Node feature fields in three possible formats: + + - string: All nodes have the same feature name. + - list of string: All nodes have the same list of features. + - dict of list of string: Each node type has a different set of node features. + + Default: None. + edge_feats: str, or dict of list of str + Edge feature fields in three possible formats: + + - string: All edges have the same feature name. + - list of string: All edges have the same list of features. + - dict of list of string: Each edge type has a different set of edge features. + + Default: None. """ def __init__(self, dataset, target_idx, fanout, label_field, node_feats=None, edge_feats=None): @@ -1418,132 +1458,140 @@ def __init__(self, dataset, target_idx, fanout, self._edge_feats = edge_feats def __iter__(self): - """ Returns an iterator object + """ Returns an iterator object.
""" def __next__(self): - """ Return a mini-batch data for the node task. + """ Return a mini-batch data for node tasks. - A mini-batch comprises three objects: the input node IDs of the mini-batch, - the target nodes and the subgraph blocks for message passing. + A mini-batch comprises three objects: 1) the input node IDs of the mini-batch, + 2) the target nodes, and 3) the subgraph blocks for message passing. Returns ------- - dict of Tensors : the input node IDs of the mini-batch. - dict of Tensors : the target node IDs. - list of DGLGraph : the subgraph blocks for message passing. + + - dict of Tensors : the input node IDs of the mini-batch. + - dict of Tensors : the target node indexes. + - list of DGLGraph : the subgraph blocks for message passing. + """ def __len__(self): - """ Return the length (number of mini-batches) of the data loader + """ Return the length (number of mini-batches) of the dataloader. Returns + ------- int: length """ @property def data(self): - """ The dataset of this dataloader. + """ The data of the dataloader, which is given in class initialization. Returns ------- - GSgnnData : The dataset of the dataloader. + GSgnnData : The data of the dataloader. """ return self._data @property def target_nidx(self): - """ Target edge idx for prediction + """ Target node indexes for prediction, which are given in class initialization. Returns ------- - dict of Tensors : the target edge IDs. + dict of Tensors : the target node indexes. """ return self._target_idx @property def fanout(self): - """ The fan out of each GNN layers + """ The fanout of each GNN layer, which is given in class initialization. Returns ------- - list or a dict of list : the fanouts for each GNN layer. + list or a dict of list : the fanouts for each GNN layer, which are given in class + initialization. """ return self._fanout @property def label_field(self): - """ The label field + """ The label field, which is given in class initialization.
Returns ------- - str: Label fields in the graph. + str, or dict of str: Label fields, which are given in class initialization. """ return self._label_field @property def node_feat_fields(self): - """ Node features + """ Node feature fields, which are given in class initialization. Returns ------- - str or dict of list of str: Node feature fields in the graph. + str, or dict of list of str: Node feature fields, which are given in class initialization. """ return self._node_feats @property def edge_feat_fields(self): - """ Edge features + """ Edge feature fields, which are given in class initialization. Returns ------- - str or dict of list of str: Node feature fields in the graph. + str, or dict of list of str: Edge feature fields, which are given in class initialization. """ return self._edge_feats class GSgnnNodeDataLoader(GSgnnNodeDataLoaderBase): - """ Minibatch dataloader for node tasks + """ Mini-batch dataloader for node tasks. - GSgnnNodeDataLoader samples GraphStorm node dataset into an iterable over mini-batches of - samples including target nodes and sampled neighbor nodes, which will be used by GraphStorm + ``GSgnnNodeDataLoader`` samples GraphStorm data into an iterable over mini-batches of + samples, including target nodes and sampled neighbor nodes, which will be used by GraphStorm Trainers and Inferrers. Parameters ---------- dataset: GSgnnData - The GraphStorm dataset + The GraphStorm data. target_idx : dict of Tensors - The target nodes for prediction - fanout: list of int or dict of list - Neighbor sample fanout. If it's a dict, it indicates the fanout for each edge type. + The target node indexes for prediction. + fanout: list of int, or dict of list + Neighbor sampling fanout. If it's a dict of list, it indicates the fanout for each + edge type. label_field: str Label field of the node task. - (TODO:xiangsx) Support list of str for single dataloader multiple node tasks. - node_feats: str, list of str or dist of list of str - Node features.
- str: All the nodes have the same feature name. - list of string: All the nodes have the same list of features. - dist of list of string: Each node type have different set of node features. - Default: None - edge_feats: str, list of str or dist of list of str - Edge features. - str: All the edges have the same feature name. - list of string: All the edges have the same list of features. - dist of list of string: Each edge type have different set of edge features. - Default: None + node_feats: str, list of str or dict of list of str + Node feature fields in three possible formats: + + - string: All nodes have the same feature name. + - list of string: All nodes have the same list of features. + - dict of list of string: Each node type has a different set of node features. + + Default: None. + edge_feats: str, list of str or dict of list of str + Edge feature fields in three possible formats: + + - string: All edges have the same feature name. + - list of string: All edges have the same list of features. + - dict of list of string: Each edge type has a different set of edge features. + + Default: None. batch_size: int - Batch size + Mini-batch size. train_task : bool - Whether or not for training. + Whether or not it is the dataloader for training. construct_feat_ntype : list of str The node types that requires to construct node features. construct_feat_fanout : int - The fanout required to construct node features. + The fanout used when constructing node features for feature-less nodes. Examples ---------- To train a 2-layer GNN for node classification on a set of nodes ``target_idx`` on - a graph where each nodes takes messages from 15 neighbors on the first layer + a graph where each node takes messages from 15 neighbors on the first layer and 10 neighbors on the second. ..
code:: python @@ -1614,48 +1662,57 @@ def __next__(self): return self.dataloader.__next__() def __len__(self): - # Follow - # https://github.com/dmlc/dgl/blob/1.0.x/python/dgl/distributed/dist_dataloader.py#L116 - # In DGL, DistDataLoader.expected_idxs is the length (number of batches) - # of the datalaoder. + """ Follow the + https://github.com/dmlc/dgl/blob/1.0.x/python/dgl/distributed/dist_dataloader.py#L116. + In DGL, ``DistDataLoader.expected_idxs`` is the length (number of batches) + of the dataloader. + + Returns: + -------- + int: The length (number of batches) of the dataloader. + """ return self.dataloader.expected_idxs class GSgnnNodeSemiSupDataLoader(GSgnnNodeDataLoader): - """ Semisupervised Minibatch dataloader for node tasks + """ Semi-supervised mini-batch dataloader for node tasks. Parameters ---------- dataset: GSgnnData - The GraphStorm dataset + The GraphStorm data. target_idx : dict of Tensors - The target nodes for prediction + The target node indexes for prediction. unlabeled_idx : dict of Tensors - The unlabeled nodes for semi-supervised training - fanout: list of int or dict of list - Neighbor sample fanout. If it's a dict, it indicates the fanout for each edge type. + The unlabeled node indexes for semi-supervised training. + fanout: list of int, or dict of list + Neighbor sampling fanout. If it's a dict of list, it indicates the fanout for each + edge type. batch_size: int - Batch size, the sum of labeled and unlabeled nodes + Mini-batch size, the sum of labeled and unlabeled nodes label_field: str Label field of the node task. - (TODO:xiangsx) Support list of str for single dataloader multiple node tasks. - node_feats: str, list of str or dist of list of str - Node features. - str: All the nodes have the same feature name. - list of string: All the nodes have the same list of features. - dist of list of string: Each node type have different set of node features. 
+ node_feats: str, list of str, or dict of list of str + Node feature fields in three possible formats: + + - string: All nodes have the same feature name. + - list of string: All nodes have the same list of features. + - dict of list of string: Each node type has a different set of node features. + Default: None - edge_feats: str, list of str or dist of list of str - Edge features. - str: All the edges have the same feature name. - list of string: All the edges have the same list of features. - dist of list of string: Each edge type have different set of edge features. + edge_feats: str, list of str, or dict of list of str + Edge feature fields in three possible formats: + + - string: All edges have the same feature name. + - list of string: All edges have the same list of features. + - dict of list of string: Each edge type has a different set of edge features. + Default: None train_task : bool - Whether or not for training. + Whether or not it is the dataloader for training. construct_feat_ntype : list of str The node types that requires to construct node features. construct_feat_fanout : int - The fanout required to construct node features. + The fanout used when constructing node features for feature-less nodes. """ def __init__(self, dataset, target_idx, unlabeled_idx, fanout, batch_size, label_field, @@ -1683,12 +1740,17 @@ def __next__(self): return self.dataloader.__next__(), self.unlabeled_dataloader.__next__() def __len__(self): - # Follow - # https://github.com/dmlc/dgl/blob/1.0.x/python/dgl/distributed/dist_dataloader.py#L116 - # In DGL, DistDataLoader.expected_idxs is the length (number of batches) - # of the datalaoder. - # As it uses two dataloader, either one throws - # an End of Iter error will stop the dataloader. + """ + Follow the + https://github.com/dmlc/dgl/blob/1.0.x/python/dgl/distributed/dist_dataloader.py#L116. + In DGL, ``DistDataLoader.expected_idxs`` is the length (number of batches) + of the dataloader.
As it uses two dataloader, either one throws an End of Iter error + will stop the dataloader. + + Returns: + -------- + int: The length (number of batches) of the dataloader. + """ return min(self.dataloader.expected_idxs, self.unlabeled_dataloader.expected_idxs) diff --git a/python/graphstorm/dataloading/dataset.py b/python/graphstorm/dataloading/dataset.py index 2915000a38..bb94e5bcbf 100644 --- a/python/graphstorm/dataloading/dataset.py +++ b/python/graphstorm/dataloading/dataset.py @@ -155,29 +155,28 @@ def prepare_batch_edge_input(g, input_edges, return feat class GSgnnData(): - """ The GraphStorm data + """ The GraphStorm data class. Parameters ---------- part_config : str - The path of the partition configuration file. + The path of the partition configuration JSON file. node_feat_field: str or dict of list of str - The fields of the node features that will be encoded by GSNodeInputLayer. + The fields of the node features that will be encoded by ``GSNodeInputLayer``. It's a dict if different node types have different feature names. - Default: None + Default: None. edge_feat_field : str or dict of list of str - The fields of the edge features. - It's a dict if different edge types have - different feature names. - This argument is reserved by future usage. - Default: None + The fields of the edge features. It's a dict, if different edge types have + different feature names. This argument is reserved for future usage when the + ``GSEdgeInputLayer`` is implemented. + Default: None. lm_feat_ntypes : list of str The node types that contains text features. - Default: None + Default: None. lm_feat_etypes : list of tuples The edge types that contains text features. - Default: None + Default: None. """ def __init__(self, part_config, node_feat_field=None, edge_feat_field=None, @@ -258,24 +257,26 @@ def __init__(self, part_config, node_feat_field=None, edge_feat_field=None, @property def g(self): - """ The distributed graph. 
+ """ The distributed graph loaded using information in the given part_config JSON file. """ return self._g @property def graph_name(self): - """ The graph name + """ The distributed graph's name extracted from the given part_config JSON file. """ return self._graph_name @property def node_feat_field(self): - """The field of node feature""" + """ The fields of node features given in initialization. + """ return self._node_feat_field @property def edge_feat_field(self): - """the field of edge feature""" + """ The fields of edge features given in initialization. + """ return self._edge_feat_field def _check_node_feats(self, node_feat_field): @@ -308,7 +309,7 @@ def has_node_feats(self, ntype): Returns ------- - bool : whether the node type has features. + bool : Whether the node type has features. """ if isinstance(self.node_feat_field, str): return True @@ -323,11 +324,11 @@ def has_edge_feats(self, etype): Parameters ---------- etype : (str, str, str) - The canonical edge type + The canonical edge type. Returns ------- - bool : whether the edge type has features + bool : Whether the edge type has features. """ if isinstance(self.edge_feat_field, str): return True @@ -342,11 +343,11 @@ def has_node_lm_feats(self, ntype): Parameters ---------- ntype : str - The node type + The node type. Returns ------- - bool : whether the node type has features. + bool : Whether the node type has text features. """ return ntype in self._lm_feat_ntypes @@ -356,23 +357,24 @@ def has_edge_lm_feats(self, etype): Parameters ---------- etype : (str, str, str) - The edge type + The edge type. Returns ------- - bool : whether the node type has features. + bool : Whether the edge type has text features. """ return etype in self._lm_feat_etypes def get_node_feats(self, input_nodes, nfeat_fields, device='cpu'): - """ Get the node features + """ Get the node features of the given input nodes. The feature fields are defined + in ``nfeat_fields``. 
Parameters ---------- input_nodes : Tensor or dict of Tensors - The input node IDs - nfeat_fields : str or dict of list - The node features to collect from graph + The input node IDs. + nfeat_fields : str or dict of [str ...] + The node feature fields to be extracted. device : Pytorch device The device where the returned node features are stored. @@ -390,14 +392,15 @@ def get_node_feats(self, input_nodes, nfeat_fields, device='cpu'): feat_field=nfeat_fields) def get_edge_feats(self, input_edges, efeat_fields, device='cpu'): - """ Get the edge features + """ Get the edge features of the given input edges. The feature fields are defined + in ``efeat_fields``. Parameters ---------- input_edges : Tensor or dict of Tensors - The input edge IDs + The input edge IDs. efeat_fields: str or dict of [str ..] - The edge data fields that stores the edge features to retrieve + The edge feature fields to be extracted. device : Pytorch device The device where the returned edge features are stored. @@ -473,15 +476,15 @@ def _check_node_mask(self, ntypes, masks): return masks def get_unlabeled_node_set(self, train_idxs, mask="train_mask"): - """ Collect nodes not used for training. + """ Get node indexes not having the given mask in the training set. Parameters __________ - train_idxs: dict + train_idxs: dict of Tensor The training set. mask: str or list of str - The node feature field storing the training mask. - Default: "train_mask" + The node feature fields storing the training mask. + Default: "train_mask". Returns ------- @@ -510,19 +513,19 @@ def get_unlabeled_node_set(self, train_idxs, mask="train_mask"): return unlabeled_idxs def get_node_train_set(self, ntypes, mask="train_mask"): - """ Get node training set for nodes of ntypes. + """ Get the training set for the given node types under the given mask. Parameters __________ ntypes: str or list of str Node types to get the training set. mask: str or list of str - The node feature field storing the training mask. 
- Default: "train_mask" + The node feature fields storing the training mask. + Default: "train_mask". Returns ------- - dict of Tensors : The returned training node indexes + dict of Tensors : The returned training node indexes. """ g = self._g pb = g.get_partition_book() @@ -584,19 +587,19 @@ def _get_node_set(self, ntypes, mask): return idxs, num_data def get_node_val_set(self, ntypes, mask="val_mask"): - """ Get node validation set for nodes of ntypes. + """ Get the validation set for the given node types under the given mask. Parameters __________ ntypes: str or list of str Node types to get the validation set. mask: str or list of str - The node feature field storing the validation mask. - Default: "val_mask" + The node feature fields storing the validation mask. + Default: "val_mask". Returns ------- - dict of Tensors : The returned validation node indexes + dict of Tensors : The returned validation node indexes. """ idxs, num_data = self._get_node_set(ntypes, mask) logging.info('part %d, val %d', get_rank(), num_data) @@ -604,19 +607,19 @@ def get_node_val_set(self, ntypes, mask="val_mask"): return idxs def get_node_test_set(self, ntypes, mask="test_mask"): - """ Get node test set for nodes of ntypes. + """ Get the test set for the given node types under the given mask. Parameters __________ ntypes: str or list of str Node types to get the test set. mask: str or list of str - The node feature field storing the test mask. - Default: "test_mask" + The node feature fields storing the test mask. + Default: "test_mask". Returns ------- - dict of Tensors : The returned test node indexes + dict of Tensors : The returned test node indexes. """ idxs, num_data = self._get_node_set(ntypes, mask) logging.info('part %d, test %d', get_rank(), num_data) @@ -624,19 +627,19 @@ def get_node_test_set(self, ntypes, mask="test_mask"): return idxs def get_node_infer_set(self, ntypes, mask="test_mask"): - """ Get node set for inference. 
+ """ Get inference node set for the given node types under the given mask. - If the mask exists in g.nodes[ntype].data, the inference set + If the mask exists in ``g.nodes[ntype].data``, the inference set is collected based on the mask. - If not, the entire node set are treated as the inference set. + If the mask does not exist, the entire node set is treated as the inference set. Parameters __________ ntypes: str or list of str Node types to get the inference set. mask: str or list of str - The node feature field storing the inference mask. - Default: "test_mask" + The node feature fields storing the inference mask. + Default: "test_mask". Returns ------- @@ -738,20 +741,20 @@ def _exclude_reverse_etype(self, etypes, reverse_edge_types_map=None): def get_edge_train_set(self, etypes=None, mask="train_mask", reverse_edge_types_map=None): - """ Get edge training set for edges of etypes. + """ Get the training set for the given edge types under the given mask. Parameters __________ etypes: list of str List of edge types to get the training set. If set to None, all the edge types are included. - Default: None + Default: None. mask: str or list of str - The edge feature field storing the training mask. - Default: "train_mask" - reverse_edge_types_map: dict - A map for reverse edge type. - Default: None + The edge feature fields storing the training mask. + Default: "train_mask". + reverse_edge_types_map: dict of tuples + A map for reverse edge types in the format of {(edge type):(reversed edge type)}. + Default: None. Returns ------- @@ -820,7 +823,7 @@ def _get_edge_set(self, etypes, mask, reverse_edge_types_map): def get_edge_val_set(self, etypes=None, mask="val_mask", reverse_edge_types_map=None): - """ Get edge validation set for edges of etypes. + """ Get the validation set for the given edge types under the given mask. Parameters __________ @@ -829,13 +832,14 @@ def get_edge_val_set(self, etypes=None, mask="val_mask", If set to None, all the edge types are included.
mask: str or list of str The edge feature field storing the val mask. - Default: "val_mask" + Default: "val_mask". reverse_edge_types_map: dict - A map for reverse edge type. + A map for reverse edge types in the format of {(edge type):(reversed edge type)}. + Default: None. Returns ------- - dict of Tensors : The returned validation edge indexes + dict of Tensors : The returned validation edge indexes. """ idxs, num_data = self._get_edge_set(etypes, mask, reverse_edge_types_map) logging.info('part %d, val %d', get_rank(), num_data) @@ -844,7 +848,7 @@ def get_edge_val_set(self, etypes=None, mask="val_mask", def get_edge_test_set(self, etypes=None, mask="test_mask", reverse_edge_types_map=None): - """ Get edge test set for edges of etypes. + """ Get the test set for the given edge types under the given mask. Parameters __________ @@ -853,9 +857,10 @@ def get_edge_test_set(self, etypes=None, mask="test_mask", If set to None, all the edge types are included. mask: str or list of str The edge feature field storing the test mask. - Default: "test_mask" + Default: "test_mask". reverse_edge_types_map: dict - A map for reverse edge type. + A map for reverse edge types in the format of {(edge type):(reversed edge type)}. + Default: None. Returns ------- @@ -867,27 +872,28 @@ def get_edge_test_set(self, etypes=None, mask="test_mask", return idxs def get_edge_infer_set(self, etypes=None, mask="test_mask", reverse_edge_types_map=None): - """ Get edge set for inference. + """ Get the inference set for the given edge types under the given mask. - If the mask exists in g.edges[etype].data, the inference set + If the mask exists in ``g.edges[etype].data``, the inference set is collected based on the mask. - If not, the entire edge set are treated as the inference set. + If not exist, the entire edge set are treated as the inference set. Parameters __________ etypes: list of str List of edge types to get the inference set. If set to None, all the edge types are included. 
- Default: None + Default: None. mask: str or list of str The edge feature field storing the inference mask. - Default: "test_mask" + Default: "test_mask". reverse_edge_types_map: dict - A map for reverse edge type. + A map for reverse edge types in the format of {(edge type):(reversed edge type)}. + Default: None. Returns ------- - dict of Tensors : The returned inference edge indexes + dict of Tensors : The returned inference edge indexes. """ g = self._g pb = g.get_partition_book() diff --git a/python/graphstorm/trainer/lp_trainer.py b/python/graphstorm/trainer/lp_trainer.py index 5308a75204..182e5feecd 100644 --- a/python/graphstorm/trainer/lp_trainer.py +++ b/python/graphstorm/trainer/lp_trainer.py @@ -182,11 +182,11 @@ def fit(self, train_loader, num_epochs, input_nodes = {pos_graph.ntypes[0]: input_nodes} nfeat_fields = train_loader.node_feat_fields input_feats = data.get_node_feats(input_nodes, nfeat_fields, device) - if train_loader.pos_graph_feat_fields is not None: + if train_loader.pos_graph_edge_feat_fields is not None: input_edges = {etype: pos_graph.edges[etype].data[dgl.EID] \ for etype in pos_graph.canonical_etypes} pos_graph_feats = data.get_edge_feats(input_edges, - train_loader.pos_graph_feat_fields, + train_loader.pos_graph_edge_feat_fields, device) else: pos_graph_feats = None diff --git a/python/graphstorm/trainer/mt_trainer.py b/python/graphstorm/trainer/mt_trainer.py index 630e70235d..a9e13ba0f8 100644 --- a/python/graphstorm/trainer/mt_trainer.py +++ b/python/graphstorm/trainer/mt_trainer.py @@ -180,11 +180,11 @@ def prepare_link_predict_mini_batch(data, task_info, mini_batch, device): nfeat_fields = task_info.dataloader.node_feat_fields node_feats = data.get_node_feats(input_nodes, nfeat_fields, device) - if task_info.dataloader.pos_graph_feat_fields is not None: + if task_info.dataloader.pos_graph_edge_feat_fields is not None: input_edges = {etype: pos_graph.edges[etype].data[dgl.EID] \ for etype in pos_graph.canonical_etypes} 
pos_graph_feats = data.get_edge_feats(input_edges, - task_info.dataloader.pos_graph_feat_fields, + task_info.dataloader.pos_graph_edge_feat_fields, device) else: pos_graph_feats = None