From 11987822eb35549776b7347d950930d33ac31846 Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Mon, 4 Dec 2023 10:21:37 -0800 Subject: [PATCH 01/17] Implement hard negative samplers --- python/graphstorm/config/argument.py | 121 +++++++++++++- python/graphstorm/dataloading/dataloading.py | 18 ++- python/graphstorm/dataloading/sampler.py | 161 +++++++++++++++++++ 3 files changed, 296 insertions(+), 4 deletions(-) diff --git a/python/graphstorm/config/argument.py b/python/graphstorm/config/argument.py index a5d58c88ce..e8416d1406 100644 --- a/python/graphstorm/config/argument.py +++ b/python/graphstorm/config/argument.py @@ -1874,9 +1874,128 @@ def lp_edge_weight_for_loss(self): return None + @property + def train_hard_edge_dstnode_negative(self): + """ The list of canonical etypes that have hard negative sets + + The format of the arguement should be: + train_hard_edge_dstnode_negative: + - src_type,rel_type0,dst_type:negative_nid_field + - src_type,rel_type1,dst_type:negative_nid_field + Each edge type can have different fields storing the hard negatives. + + or + train_hard_edge_dstnode_negative: + - negative_nid_field + All the edge types use the same filed storing the hard negatives. + """ + # pylint: disable=no-member + if hasattr(self, "_train_hard_edge_dstnode_negative"): + assert self.task_type == BUILTIN_TASK_LINK_PREDICTION, \ + "Hard negative only works with link prediction" + hard_negatives = self._train_hard_edge_dstnode_negative + if len(hard_negatives) == 1 and \ + ":" not in hard_negatives[0]: + # global feat_name + return hard_negatives[0] + + # per edge type feature + hard_negative_dict = {} + for hard_negative in hard_negatives: + negative_info = hard_negative.split(":") + etype = tuple(negative_info[0].split(",")) + assert etype not in hard_negative_dict, \ + f"You already specify the fixed negative of {etype} " \ + f"as {hard_negative_dict[etype]}" + + hard_negative_dict[etype] = negative_info[1] + return hard_negative_dict + + # By default fixed negative is not used + return None + + @property + def num_hard_negatives(self): + """ Number of hard negatives per edge type + + The format of the arguement should be: + num_hard_negatives: + - src_type,rel_type0,dst_type:num_negatives + - src_type,rel_type1,dst_type:num_negatives + Each edge type can have different number of hard negatives. + + or + num_hard_negatives: + - num_negatives + All the edge types use the same number of hard negatives. 
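A minimal standalone sketch of the parsing rule the properties above describe: a single entry with no ":" is treated as one value shared by every edge type, while "src,rel,dst:value" entries build a per-edge-type mapping. The edge type and feature names below are illustrative only.

    def parse_per_etype_setting(entries):
        # One bare value means every edge type shares the same setting.
        if len(entries) == 1 and ":" not in entries[0]:
            return entries[0]
        parsed = {}
        for entry in entries:
            etype_str, value = entry.split(":")
            parsed[tuple(etype_str.split(","))] = value
        return parsed

    parse_per_etype_setting(["query,clicks,asin:hard_neg"])
    # -> {('query', 'clicks', 'asin'): 'hard_neg'}
    parse_per_etype_setting(["hard_neg"])
    # -> 'hard_neg'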
+ """ + # pylint: disable=no-member + if hasattr(self, "_num_hard_negatives"): + assert self.task_type == BUILTIN_TASK_LINK_PREDICTION, \ + "Hard negative only works with link prediction" + num_negatives = self._num_hard_negatives + if len(num_negatives) == 1 and \ + ":" not in num_negatives[0]: + # global feat_name + return int(num_negatives[0]) + + # per edge type feature + num_hard_negative_dict = {} + for num_negative in num_negatives: + negative_info = num_negative.split(":") + etype = tuple(negative_info[0].split(",")) + assert etype not in num_hard_negative_dict, \ + f"You already specify the fixed negative of {etype} " \ + f"as {num_hard_negative_dict[etype]}" + + num_hard_negative_dict[etype] = int(negative_info[1]) + return num_hard_negative_dict + + return None + + @property + def eval_fixed_edge_dstnode_negative(self): + """ The list of canonical etypes that have predefined negative sets + + The format of the arguement should be: + eval_fixed_edge_dstnode_negative: + - src_type,rel_type0,dst_type:negative_nid_field + - src_type,rel_type1,dst_type:negative_nid_field + Each edge type can have different fields storing the fixed negatives. + + or + eval_fixed_edge_dstnode_negative: + - negative_nid_field + All the edge types use the same filed storing the fixed negatives. + """ + # pylint: disable=no-member + if hasattr(self, "_eval_fixed_edge_dstnode_negative"): + assert self.task_type == BUILTIN_TASK_LINK_PREDICTION, \ + "Fixed negative only works with link prediction" + fixed_negatives = self._eval_fixed_edge_dstnode_negative + if len(fixed_negatives) == 1 and \ + ":" not in fixed_negatives[0]: + # global feat_name + return fixed_negatives[0] + + # per edge type feature + fixed_negative_dict = {} + for fixed_negative in fixed_negatives: + negative_info = fixed_negative.split(":") + etype = tuple(negative_info[0].split(",")) + assert etype not in fixed_negative_dict, \ + f"You already specify the fixed negative of {etype} " \ + f"as {fixed_negative_dict[etype]}" + + fixed_negative_dict[etype] = negative_info[1] + return fixed_negative_dict + + # By default fixed negative is not used + return None + @property def train_etype(self): - """ The list of canonical etype that will be added as + """ The list of canonical etypes that will be added as training target with the target e type(s) If not provided, all edge types will be used as training target. diff --git a/python/graphstorm/dataloading/dataloading.py b/python/graphstorm/dataloading/dataloading.py index f367219dde..afa1a7bf5c 100644 --- a/python/graphstorm/dataloading/dataloading.py +++ b/python/graphstorm/dataloading/dataloading.py @@ -33,7 +33,8 @@ JointLocalUniform, InbatchJointUniform, FastMultiLayerNeighborSampler, - DistributedFileSampler) + DistributedFileSampler, + GSHardEdgeDstNegative) from .utils import trim_data, modify_fanout_for_target_etype from .dataset import GSDistillData @@ -972,9 +973,11 @@ class GSgnnLinkPredictionTestDataLoader(): When test is huge, using fixed_test_size can save validation and test time. Default: None. + fixed_edge_dst_negative_field: str or list of str + The feature field(s) that store the fixed negative set for each edge. 
""" - def __init__(self, dataset, target_idx, batch_size, num_negative_edges, fanout=None, - fixed_test_size=None): + def __init__(self, dataset, target_idx, batch_size, num_negative_edges, + fanout=None, fixed_test_size=None, fixed_edge_dst_negative_field=None): self._data = dataset self._fanout = fanout for etype in target_idx: @@ -991,6 +994,7 @@ def __init__(self, dataset, target_idx, batch_size, num_negative_edges, fanout=N "is %d, which is smaller than the expected" "test size %d, force it to %d", etype, len(t_idx), self._fixed_test_size[etype], len(t_idx)) + self._fixed_edge_dst_negative_field = fixed_edge_dst_negative_field self._negative_sampler = self._prepare_negative_sampler(num_negative_edges) self._reinit_dataset() @@ -1009,6 +1013,10 @@ def _prepare_negative_sampler(self, num_negative_edges): # the default negative sampler is uniform sampler self._neg_sample_type = BUILTIN_LP_UNIFORM_NEG_SAMPLER negative_sampler = GlobalUniform(num_negative_edges) + if self._fixed_edge_dst_negative_field: + negative_sampler = GSHardEdgeDstNegative(num_negative_edges, + self._fixed_edge_dst_negative_field, + negative_sampler) return negative_sampler def __iter__(self): @@ -1057,6 +1065,10 @@ def _prepare_negative_sampler(self, num_negative_edges): # the default negative sampler is uniform sampler negative_sampler = JointUniform(num_negative_edges) self._neg_sample_type = BUILTIN_LP_JOINT_NEG_SAMPLER + if self._fixed_edge_dst_negative_field: + negative_sampler = GSHardEdgeDstNegative(num_negative_edges, + self._fixed_edge_dst_negative_field, + negative_sampler) return negative_sampler ################ Minibatch DataLoader (Node classification) ####################### diff --git a/python/graphstorm/dataloading/sampler.py b/python/graphstorm/dataloading/sampler.py index 53b9d46794..f474a71cc1 100644 --- a/python/graphstorm/dataloading/sampler.py +++ b/python/graphstorm/dataloading/sampler.py @@ -70,6 +70,167 @@ def _generate(self, g, eids, canonical_etype): dst = F.randint(shape, dtype, ctx, 0, self._local_neg_nids[vtype].shape[0]) return src, self._local_neg_nids[vtype][dst] +class GSHardEdgeDstNegative(object): + """ GraphStorm negativer sampler that chooses negative destination nodes + from a fixed set to create negative edges. + """ + def __init__(self, k, dst_negative_field, negative_sampler, num_hard_negs=None): + self._dst_negative_field = dst_negative_field + self._k = k + self._negative_sampler = negative_sampler + self._num_hard_negs = num_hard_negs + + def _generate(self, g, eids, canonical_etype): + if isinstance(self._dst_negative_field, str): + dst_negative_field = self._dst_negative_field + elif canonical_etype in self._dst_negative_field: + dst_negative_field = self._dst_negative_field[canonical_etype] + else: + dst_negative_field = None + + if isinstance(self._num_hard_negs, int): + required_num_hard_neg = self._num_hard_negs + elif canonical_etype in self._num_hard_negs: + required_num_hard_neg = self._num_hard_negs[canonical_etype] + else: + required_num_hard_neg = 0 + + if dst_negative_field is None or required_num_hard_neg == 0: + # no hard negative, fallback to random negative + return self._negative_sampler._generate(g, eids, canonical_etype) + + hard_negatives = g.edges[canonical_etype].data[dst_negative_field][eids] + # It is possible that different edges may have different number of + # pre-defined negatives. For pre-defined negatives, the corresponding + # value in `hard_negatives` will be integers representing the node ids. 
+ # For others, they will be -1s meaning there are missing fixed negatives. + if th.sum(hard_negatives == -1) == 0: + # Fast track, there is no -1 in hard_negatives + max_num_hard_neg = hard_negatives.shape[1] + neg_idx = th.randperm(max_num_hard_neg) + hard_negatives = hard_negatives[:,neg_idx] + if required_num_hard_neg >= self._k and max_num_hard_neg >= self._k: + # All negative should be hard negative and + # there are enough hard negatives. + src, _ = g.find_edges(eids, etype=canonical_etype) + src = F.repeat(src, self.k, 0) + return src, hard_negatives.reshape((-1,)) + else: + if required_num_hard_neg < max_num_hard_neg: + # Only need required_num_hard_neg hard negatives. + hard_negatives = hard_negatives[:,:required_num_hard_neg] + num_hard_neg = required_num_hard_neg + else: + # There is not enough hard negative to fill required_num_hard_neg + num_hard_neg = max_num_hard_neg + + # There is not enough negatives + src, neg = self._negative_sampler._generate(g, eids, canonical_etype) + # replace random negatives with fixed negatives + neg[:,:num_hard_neg] = hard_negatives + return src, neg + else: + # slow track, we need to handle cases when there are -1s + hard_negatives, _ = th.sort(dim=1, descending=True) + + src, neg = self._negative_sampler._generate(g, eids, canonical_etype) + for i in range(len(eids)): + hard_negative = hard_negatives[i] + # ignore -1s + hard_negative = hard_negative[hard_negative > -1] + max_num_hard_neg = hard_negative.shape[0] + hard_negative = hard_negative[th.randperm(max_num_hard_neg)] + + if required_num_hard_neg < max_num_hard_neg: + # Only need required_num_hard_neg hard negatives. + hard_negative = hard_negative[:required_num_hard_neg] + num_hard_neg = required_num_hard_neg + else: + num_hard_neg = max_num_hard_neg + + # replace random negatives with fixed negatives + neg[i*self._k:i*self._k + num_hard_neg \ + if num_hard_neg < self._k else self._k] = \ + hard_negative[:num_hard_neg if num_hard_neg < self._k else self._k] + return src, neg + + def gen_neg_pairs(self, g, pos_pairs): + """ Returns negative examples associated with positive examples. + It only return dst negatives. + + Parameters + ---------- + g : DGLGraph + The graph. + pos_pairs : (Tensor, Tensor) or dict[etype, (Tensor, Tensor)] + The positive node pairs + + Returns + ------- + tuple[Tensor, Tensor, Tensor, Tensor] or + dict[etype, tuple(Tensor, Tensor Tensor, Tensor) + The returned [positive source, negative source, + postive destination, negatve destination] + tuples as pos-neg examples. + """ + def _gen_neg_pair(pos_pair, canonical_etype): + src, pos_dst = pos_pair + eids = g.edge_ids(src, pos_dst, etype=canonical_etype) + + if isinstance(self._dst_negative_field, str): + dst_negative_field = self._dst_negative_field + elif canonical_etype in self._dst_negative_field: + dst_negative_field = self._dst_negative_field[canonical_etype] + else: + dst_negative_field = None + + if dst_negative_field is None: + src, _, pos_dst, neg_dst = \ + self._negative_sampler.gen_neg_pairs(g, {canonical_etype:pos_pair}) + return (src, None, pos_dst, neg_dst) + + hard_negatives = g.edges[canonical_etype].data[dst_negative_field][eids] + # It is possible that different edges may have different number of + # pre-defined negatives. For pre-defined negatives, the corresponding + # value in `hard_negatives` will be integers representing the node ids. + # For others, they will be -1s meaning there are missing fixed negatives. 
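A toy illustration of the padding convention described in the comment above, assuming the hard-negative edge feature is stored as a fixed-width integer tensor (values are made up):

    import torch as th
    hard_negatives = th.tensor([
        [12, 57, 88,  3],   # this edge defines 4 hard negatives
        [41,  9, -1, -1],   # this edge defines only 2; the rest are -1 padding
    ])
    th.sum(hard_negatives == -1) == 0   # tensor(False): -1s present, so the slow track is taken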
+ if th.sum(hard_negatives == -1) == 0: + # Fast track, there is no -1 in hard_negatives + num_hard_neg = hard_negatives.shape[1] + if self._k < num_hard_neg: + hard_negatives = hard_negatives[:self._k] + return (src, None, pos_dst, hard_negatives) + else: + # random negative are needed + src, _, pos_dst, neg_dst = \ + self._negative_sampler.gen_neg_pairs(g, {canonical_etype:pos_pair}) + neg_dst[:,:num_hard_neg] = hard_negatives + return (src, None, pos_dst, neg_dst) + else: + # slow track, we need to handle cases when there are -1s + hard_negatives, _ = th.sort(dim=1, descending=True) + + src, _, pos_dst, neg_dst = \ + self._negative_sampler.gen_neg_pairs(g, {canonical_etype:pos_pair}) + for i in range(len(eids)): + hard_negative = hard_negatives[i] + # ignore -1s + hard_negative = hard_negative[hard_negative > -1] + num_hard_neg = hard_negative.shape[0] + neg_dst[:num_hard_neg if num_hard_neg < self._k else self._k] = \ + hard_negative[:num_hard_neg if num_hard_neg < self._k else self._k] + return (src, _, pos_dst, neg_dst) + + if isinstance(pos_pairs, Mapping): + pos_neg_tuple = {} + for canonical_etype, pos_pair in pos_pairs.items(): + pos_neg_tuple[canonical_etype] = _gen_neg_pair(pos_pair, canonical_etype) + else: + assert len(g.canonical_etypes) == 1, \ + 'please specify a dict of etypes and ids for graphs with multiple edge types' + pos_neg_tuple = _gen_neg_pair(pos_pairs, canonical_etype) + return pos_neg_tuple + class GlobalUniform(Uniform): """Negative sampler that randomly chooses negative destination nodes for each source node according to a uniform distribution. From bb9416af1c9bf42800859ba3f99062743ab47402 Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Tue, 5 Dec 2023 15:24:17 -0800 Subject: [PATCH 02/17] add unitests --- python/graphstorm/dataloading/sampler.py | 27 ++- tests/unit-tests/test_dataloading.py | 223 +++++++++++++++++++++++ 2 files changed, 247 insertions(+), 3 deletions(-) diff --git a/python/graphstorm/dataloading/sampler.py b/python/graphstorm/dataloading/sampler.py index f474a71cc1..d0a2f473df 100644 --- a/python/graphstorm/dataloading/sampler.py +++ b/python/graphstorm/dataloading/sampler.py @@ -28,6 +28,8 @@ from dgl.dataloading import NeighborSampler from dgl.transforms import to_block +from ..utils import is_wholegraph + class LocalUniform(Uniform): """Negative sampler that randomly chooses negative destination nodes for each source node according to a uniform distribution. @@ -73,8 +75,22 @@ def _generate(self, g, eids, canonical_etype): class GSHardEdgeDstNegative(object): """ GraphStorm negativer sampler that chooses negative destination nodes from a fixed set to create negative edges. + + Parameters + ---------- + k: int + Number of negatives to sample. + dst_negative_field: str or dict of str + The field storing the hard negatives. + negative_sampler: sampler + The negative sampler to generate negatives + if there is not enough hard negatives. + num_hard_negs: int or dict of int + Number of hard negatives. """ def __init__(self, k, dst_negative_field, negative_sampler, num_hard_negs=None): + assert is_wholegraph() is False, \ + "Hard negative is not supported for WholeGraph." 
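Following the parameter list documented above, a hedged construction sketch (the values mirror the unit tests below; GlobalUniform is the fallback sampler defined in this module):

    fallback = GlobalUniform(10)              # uniform sampler used when hard negatives run out
    hard_sampler = GSHardEdgeDstNegative(
        k=10,                                 # negatives generated per positive edge
        dst_negative_field="hard_negative",   # edge feature holding hard-negative node ids
        negative_sampler=fallback,
        num_hard_negs=2)                      # at most 2 of the 10 negatives come from the hard set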
self._dst_negative_field = dst_negative_field self._k = k self._negative_sampler = negative_sampler @@ -108,12 +124,16 @@ def _generate(self, g, eids, canonical_etype): # Fast track, there is no -1 in hard_negatives max_num_hard_neg = hard_negatives.shape[1] neg_idx = th.randperm(max_num_hard_neg) + # shuffle the hard negatives hard_negatives = hard_negatives[:,neg_idx] + + print(f"{required_num_hard_neg} {max_num_hard_neg} {self._k}") if required_num_hard_neg >= self._k and max_num_hard_neg >= self._k: # All negative should be hard negative and # there are enough hard negatives. + hard_negatives = hard_negatives[:,:self._k] src, _ = g.find_edges(eids, etype=canonical_etype) - src = F.repeat(src, self.k, 0) + src = F.repeat(src, self._k, 0) return src, hard_negatives.reshape((-1,)) else: if required_num_hard_neg < max_num_hard_neg: @@ -127,8 +147,9 @@ def _generate(self, g, eids, canonical_etype): # There is not enough negatives src, neg = self._negative_sampler._generate(g, eids, canonical_etype) # replace random negatives with fixed negatives - neg[:,:num_hard_neg] = hard_negatives - return src, neg + neg = neg.reshape(-1, self._k) + neg[:,:num_hard_neg] = hard_negatives[:,:num_hard_neg] + return src, neg.reshape((-1,)) else: # slow track, we need to handle cases when there are -1s hard_negatives, _ = th.sort(dim=1, descending=True) diff --git a/tests/unit-tests/test_dataloading.py b/tests/unit-tests/test_dataloading.py index 18613e2314..ab2c6849aa 100644 --- a/tests/unit-tests/test_dataloading.py +++ b/tests/unit-tests/test_dataloading.py @@ -57,6 +57,8 @@ from graphstorm.dataloading import BUILTIN_LP_JOINT_NEG_SAMPLER from graphstorm.dataloading.sampler import InbatchJointUniform +from graphstorm.dataloading.sampler import GlobalUniform +from graphstorm.dataloading.sampler import GSHardEdgeDstNegative from graphstorm.dataloading.dataset import (prepare_batch_input, prepare_batch_edge_input) @@ -1330,6 +1332,226 @@ def test_lp_dataloader_len(batch_size): device='cuda:0', train_task=True) assert len(dataloader) == len(list(dataloader)) +def test_hard_edge_dst_negative_sample_generate_complex_case(): + num_nodes = 100 + # test GSHardEdgeDstNegative._generate when all some pos edges do not have enough hard negatives defined + num_negs = 10 + etype0 = ("n0", "r0", "n1") + etype1 = ("n0", "r0", "n2") + etype2 = ("n0", "r0", "n3") + src = th.arange(num_nodes) + dst = th.arange(num_nodes) + hard0 = th.randint(num_nodes, (num_nodes, 4)) # not enough hard negatives + hard0[0] = th.randperm(num_nodes)[:4] + hard0[0][-1] = -1 + hard0[0][-2] = -1 + hard0[1] = th.randperm(num_nodes)[:4] + hard0[1][-1] = -1 + hard1 = th.randint(num_nodes, (num_nodes, num_negs)) + hard1[0] = th.randperm(num_nodes)[:10] + hard1[0][-1] = -1 + hard1[0][-2] = -1 + hard1[1] = th.randperm(num_nodes)[:10] + hard1[1][-1] = -1 + hard2 = th.randint(num_nodes, (num_nodes, num_negs*2)) # more hard negatives than num neg + hard2[0] = th.randperm(num_nodes)[:10] + hard2[0][-1] = -1 + hard2[0][-2] = -1 + hard2[1] = th.randperm(num_nodes)[:10] + hard2[1][-1] = -1 + g = dgl.heterograph({ + etype0: (src, dst), + etype1: (src, dst), + etype2: (src, dst), + }) + g.edges[etype0].data["hard_negative"] = hard0 + g.edges[etype1].data["hard_negative"] = hard1 + g.edges[etype2].data["hard_negative"] = hard2 + + num_edges = 10 + eids = th.arange(num_edges) + def check_less_hard_negs(hard_neg_sampler, target_etype, hard_neg_data, num_hard_neg): + + + +@pytest.mark.parametrize("num_nodes", 100) +def 
test_hard_edge_dst_negative_sample_generate(num_nodes): + # test GSHardEdgeDstNegative._generate with fast track when all pos edges have enough hard negatives defined + num_negs = 10 + etype0 = ("n0", "r0", "n1") + etype1 = ("n0", "r0", "n2") + etype2 = ("n0", "r0", "n3") + src = th.arange(num_nodes) + dst = th.arange(num_nodes) + hard0 = th.randint(num_nodes, (num_nodes, 4)) # not enough hard negatives + hard1 = th.randint(num_nodes, (num_nodes, num_negs)) + hard2 = th.randint(num_nodes, (num_nodes, num_negs*2)) # more hard negatives than num neg + g = dgl.heterograph({ + etype0: (src, dst), + etype1: (src, dst), + etype2: (src, dst), + }) + + g.edges[etype0].data["hard_negative"] = hard0 + g.edges[etype1].data["hard_negative"] = hard1 + g.edges[etype2].data["hard_negative"] = hard2 + + num_edges = 10 + eids = th.arange(num_edges) + def check_less_hard_negs(hard_neg_sampler, target_etype, hard_neg_data, num_hard_neg): + neg_src, neg_dst = hard_neg_sampler._generate(g, eids, target_etype) + assert len(neg_src) == num_edges * num_negs + assert len(neg_dst) == num_edges * num_negs + assert_equal(th.repeat_interleave(src[:10], num_negs, 0).numpy(), neg_src.numpy()) + neg_dst = neg_dst.reshape(num_edges, num_negs) + for i in range(num_edges): + hard_neg_dst = neg_dst[i][:num_hard_neg] + hard_neg_dst = set(hard_neg_dst.tolist()) + rand_neg_dst = neg_dst[i][num_hard_neg:] + rand_neg_dst = set(rand_neg_dst.tolist()) + hard_neg_set = set(hard_neg_data[i].tolist()) + print(hard_neg_dst) + print(hard_neg_set) + assert hard_neg_dst.issubset(hard_neg_set) + assert rand_neg_dst.issubset(hard_neg_set) is False + + # case 1: + # 1. hard_negative field is string + # 2. num_hard_neg is int + # 3. num_negs > number of hard negatives required (2) + # 4. num_negs > total number of hard negatives + # provided (hard0 has 4 negatives for each node) + # 5. fast track + # + # expected behavior: + # 1. Only 2 hard negatives are returned + # 2. Others will be random negatives + sampler = GlobalUniform(num_negs) + hard_sampler = GSHardEdgeDstNegative(num_negs, "hard_negative", sampler, num_hard_negs=2) + check_less_hard_negs(hard_sampler, etype0, hard0, 2) + + # Case 2: + # 1. hard_negative field is string + # 2. num_hard_neg is int + # 3. num_negs > number of hard negatives required (2) + # 4. num_negs == total number of hard negatives + # provided (hard1 has 10 negatives for each node) + # 5. fast track + # + # expected behavior: + # 1. Only 2 hard negatives are returned + # 2. Others will be random negatives + check_less_hard_negs(hard_sampler, etype1, hard1, 2) + + # Case 3: + # 1. hard_negative field is string + # 2. num_hard_neg is int + # 3. num_negs > number of hard negatives required (8) + # 4. number of hard negatives required (8) > number of hard negatives + # provided (hard0 has only 4 negatives for each node) + # 5.fast track + # + # expected behavior: + # 1. Only 4 hard negatives are returned + # 2. Others will be random negatives + hard_sampler = GSHardEdgeDstNegative(num_negs, "hard_negative", sampler, num_hard_negs=8) + check_less_hard_negs(hard_sampler, etype2, hard0, 4) + + # Case 4: + # 1. hard_negative field is string + # 2. num_hard_neg is int + # 3. num_negs == number of hard negatives required (10) + # 4. number of hard negatives required (8) == number of hard negatives + # provided (hard1 has 10 negatives for each node) + # 5.fast track + # + # expected behavior: + # 1. 
Equal negatives + def check_enough_hard_negs(hard_neg_sampler, target_etype, hard_neg_data): + neg_src, neg_dst = hard_neg_sampler._generate(g, eids, target_etype) + assert len(neg_src) == num_edges * num_negs + assert len(neg_dst) == num_edges * num_negs + assert_equal(th.repeat_interleave(src[:10], num_negs, 0).numpy(), neg_src.numpy()) + neg_dst = neg_dst.reshape(num_edges, num_negs) + for i in range(num_edges): + hard_neg_dst = set(neg_dst[i].tolist()) + hard_neg_set = set(hard_neg_data[i].tolist()) + print(i) + print(hard_neg_dst) + print(hard_neg_set) + assert hard_neg_dst == hard_neg_set + hard_sampler = GSHardEdgeDstNegative(num_negs, "hard_negative", sampler, num_hard_negs=num_negs) + check_enough_hard_negs(hard_sampler, etype1, hard1) + + # Case 5: + # 1. hard_negative field is string + # 2. num_hard_neg is int + # 3. num_negs == number of hard negatives required (10) + # 4. number of hard negatives required (8) < number of hard negatives + # provided (hard2 has 20 negatives for each node) + # 5.fast track + # + # expected behavior: + # 1. hard negatives will be a subset of hard2 + def check_more_hard_negs(hard_neg_sampler, target_etype, hard_neg_data): + neg_src, neg_dst = hard_neg_sampler._generate(g, eids, target_etype) + assert len(neg_src) == num_edges * num_negs + assert len(neg_dst) == num_edges * num_negs + assert_equal(th.repeat_interleave(src[:10], num_negs, 0).numpy(), neg_src.numpy()) + neg_dst = neg_dst.reshape(num_edges, num_negs) + for i in range(num_edges): + hard_neg_dst = set(neg_dst[i].tolist()) + hard_neg_set = set(hard_neg_data[i].tolist()) + assert hard_neg_dst.issubset(hard_neg_set) + check_more_hard_negs(hard_sampler, etype2, hard2) + + # Case 6: + # hard_negative field is dict + # num_hard_neg is dict + # 3. num_negs > number of hard negatives required (2) + # 4. num_negs > total number of hard negatives + # provided (hard0 has 4 negatives for each node) + # 5. fast track + # + # expected behavior: + # 1. Only 2 hard negatives are returned + # 2. Others will be random negatives + hard_sampler = GSHardEdgeDstNegative( + num_negs, + {etype0: "hard_negative", + etype1: "hard_negative", + etype2: "hard_negative"}, + sampler, + {etype0: 2, + etype1: 2, + etype2: 10}) + check_less_hard_negs(hard_sampler, etype0, hard0, 2) + + # Case 7: + # hard_negative field is dict + # num_hard_neg is dict + # 3. num_negs > number of hard negatives required (2) + # 4. num_negs == total number of hard negatives + # provided (hard1 has 10 negatives for each node) + # 5. fast track + # + # expected behavior: + # 1. Only 2 hard negatives are returned + # 2. Others will be random negatives + check_less_hard_negs(hard_sampler, etype1, hard1, 2) + + # Case 8: + # 1. hard_negative field is string + # 2. num_hard_neg is int + # 3. num_negs == number of hard negatives required (10) + # 4. number of hard negatives required (8) < number of hard negatives + # provided (hard2 has 20 negatives for each node) + # 5.fast track + # + # expected behavior: + # 1. 
hard negatives will be a subset of hard2 + check_more_hard_negs(hard_sampler, etype2, hard2) + @pytest.mark.parametrize("num_pos", [2, 10]) @pytest.mark.parametrize("num_neg", [5, 20]) def test_inbatch_joint_neg_sampler(num_pos, num_neg): @@ -1356,6 +1578,7 @@ def test_inbatch_joint_neg_sampler(num_pos, num_neg): if __name__ == '__main__': + test_hard_edge_dst_negative_sample_generate(100) test_inbatch_joint_neg_sampler(10, 20) test_np_dataloader_len(11) From 205f010fa96090ce3e2542d8876d951b99b493ca Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Thu, 7 Dec 2023 23:37:18 -0800 Subject: [PATCH 03/17] Update --- python/graphstorm/dataloading/sampler.py | 6 +- tests/unit-tests/test_dataloading.py | 283 +++++++++++++++++++---- 2 files changed, 243 insertions(+), 46 deletions(-) diff --git a/python/graphstorm/dataloading/sampler.py b/python/graphstorm/dataloading/sampler.py index d0a2f473df..fdc1404f4b 100644 --- a/python/graphstorm/dataloading/sampler.py +++ b/python/graphstorm/dataloading/sampler.py @@ -152,7 +152,7 @@ def _generate(self, g, eids, canonical_etype): return src, neg.reshape((-1,)) else: # slow track, we need to handle cases when there are -1s - hard_negatives, _ = th.sort(dim=1, descending=True) + hard_negatives, _ = th.sort(hard_negatives, dim=1, descending=True) src, neg = self._negative_sampler._generate(g, eids, canonical_etype) for i in range(len(eids)): @@ -170,8 +170,8 @@ def _generate(self, g, eids, canonical_etype): num_hard_neg = max_num_hard_neg # replace random negatives with fixed negatives - neg[i*self._k:i*self._k + num_hard_neg \ - if num_hard_neg < self._k else self._k] = \ + neg[i*self._k:i*self._k + (num_hard_neg \ + if num_hard_neg < self._k else self._k)] = \ hard_negative[:num_hard_neg if num_hard_neg < self._k else self._k] return src, neg diff --git a/tests/unit-tests/test_dataloading.py b/tests/unit-tests/test_dataloading.py index ab2c6849aa..52260e7042 100644 --- a/tests/unit-tests/test_dataloading.py +++ b/tests/unit-tests/test_dataloading.py @@ -1332,33 +1332,18 @@ def test_lp_dataloader_len(batch_size): device='cuda:0', train_task=True) assert len(dataloader) == len(list(dataloader)) -def test_hard_edge_dst_negative_sample_generate_complex_case(): - num_nodes = 100 - # test GSHardEdgeDstNegative._generate when all some pos edges do not have enough hard negatives defined - num_negs = 10 +def _create_hard_neg_graph(num_nodes, num_negs): etype0 = ("n0", "r0", "n1") etype1 = ("n0", "r0", "n2") etype2 = ("n0", "r0", "n3") src = th.arange(num_nodes) dst = th.arange(num_nodes) - hard0 = th.randint(num_nodes, (num_nodes, 4)) # not enough hard negatives - hard0[0] = th.randperm(num_nodes)[:4] - hard0[0][-1] = -1 - hard0[0][-2] = -1 - hard0[1] = th.randperm(num_nodes)[:4] - hard0[1][-1] = -1 + # each edge has 4 pre-defined hard negatives + hard0 = th.randint(num_nodes, (num_nodes, 4)) + # each edge has 10 pre-defined hard negatives hard1 = th.randint(num_nodes, (num_nodes, num_negs)) - hard1[0] = th.randperm(num_nodes)[:10] - hard1[0][-1] = -1 - hard1[0][-2] = -1 - hard1[1] = th.randperm(num_nodes)[:10] - hard1[1][-1] = -1 + # each edge has 20 pre-defined hard negatives hard2 = th.randint(num_nodes, (num_nodes, num_negs*2)) # more hard negatives than num neg - hard2[0] = th.randperm(num_nodes)[:10] - hard2[0][-1] = -1 - hard2[0][-2] = -1 - hard2[1] = th.randperm(num_nodes)[:10] - hard2[1][-1] = -1 g = dgl.heterograph({ etype0: (src, dst), etype1: (src, dst), @@ -1368,33 +1353,197 @@ def test_hard_edge_dst_negative_sample_generate_complex_case(): 
g.edges[etype1].data["hard_negative"] = hard1 g.edges[etype2].data["hard_negative"] = hard2 + return etype0, etype1, etype2, hard0, hard1, hard2, src, g + +def test_hard_edge_dst_negative_sample_generate_complex_case(): + num_nodes = 1000 + # test GSHardEdgeDstNegative._generate when all some pos edges do not have enough hard negatives defined + num_negs = 10 + etype0, etype1, etype2, hard0, hard1, hard2, src, g = _create_hard_neg_graph(num_nodes, num_negs) + + # not enough predefined hard negatives + # for hard0[0] and hard0[1] + hard0[0] = th.randperm(num_nodes)[:4] + hard0[0][-1] = -1 + hard0[0][-2] = -1 + hard0[1][-1] = -1 + + # not enough predefined hard negatives + # for hard0[0] and hard0[1] + hard1[0] = th.randperm(num_nodes)[:num_negs] + hard1[1] = th.randperm(num_nodes)[:num_negs] + hard1[0][-1] = -1 + hard1[0][-2] = -1 + hard1[1][-1] = -1 + + # not enough predefined hard negatives + # for hard0[0] and hard0[1] + hard2[0] = th.randperm(num_nodes)[:num_negs*2] + hard2[1] = th.randperm(num_nodes)[:num_negs*2] + hard2[0][-1] = -1 + hard2[0][-2] = -1 + hard2[1][-1] = -1 + num_edges = 10 eids = th.arange(num_edges) - def check_less_hard_negs(hard_neg_sampler, target_etype, hard_neg_data, num_hard_neg): + def test_missing_hard_neg(neg_dst, num_hard_neg, hard_neg_data): + # hardx[0][-1] and hardx[0][-2] is -1, + # which means hardx[0] does not enough predefined negatives + # Random sample will be applied to -1s. + hard_neg_dst = neg_dst[0][:num_hard_neg] + hard_neg_rand_0 = hard_neg_dst[-1] + hard_neg_rand_1 = hard_neg_dst[-2] + hard_neg_dst = set(hard_neg_dst[:-2].tolist()) + rand_neg_dst = neg_dst[0][num_hard_neg:] + rand_neg_dst = set(rand_neg_dst.tolist()) + hard_neg_set = set(hard_neg_data[0].tolist()) + assert hard_neg_dst.issubset(hard_neg_set) + assert len(rand_neg_dst) == 0 or \ + rand_neg_dst.issubset(hard_neg_set) is False + + rand_0_check = hard_neg_rand_0 not in hard_neg_set + rand_1_check = hard_neg_rand_1 not in hard_neg_set + + # hardx[1][-1] is -1, + # which means hardx[0] does not enough predefined negatives + # Random sample will be applied to -1s. 
+ hard_neg_dst = neg_dst[1][:num_hard_neg] + hard_neg_rand_2 = hard_neg_dst[-1] + hard_neg_dst = set(hard_neg_dst[:-1].tolist()) + rand_neg_dst = neg_dst[1][num_hard_neg:] + rand_neg_dst = set(rand_neg_dst.tolist()) + hard_neg_set = set(hard_neg_data[1].tolist()) + assert hard_neg_dst.issubset(hard_neg_set) + assert len(rand_neg_dst) == 0 or \ + rand_neg_dst.issubset(hard_neg_set) is False + + rand_2_check = hard_neg_rand_2 not in hard_neg_set + # The chance is very to to have rand_0_check, + # rand_1_check and rand_2_check be true at the same time + # The change is (4/1000)^3 + assert rand_0_check or rand_1_check or rand_2_check + + def check_less_hard_negs(hard_neg_sampler, target_etype, hard_neg_data, + num_hard_neg, check_missing_hard_neg): + neg_src, neg_dst = hard_neg_sampler._generate(g, eids, target_etype) + assert len(neg_src) == num_edges * num_negs + assert len(neg_dst) == num_edges * num_negs + assert_equal(th.repeat_interleave(src[:10], num_negs, 0).numpy(), neg_src.numpy()) + neg_dst = neg_dst.reshape(num_edges, num_negs) + + if check_missing_hard_neg: + test_missing_hard_neg(neg_dst, num_hard_neg, hard_neg_data) + + start = 2 if check_missing_hard_neg else 0 + for i in range(start, num_edges): + hard_neg_dst = neg_dst[i][:num_hard_neg] + hard_neg_dst = set(hard_neg_dst.tolist()) + rand_neg_dst = neg_dst[i][num_hard_neg:] + rand_neg_dst = set(rand_neg_dst.tolist()) + hard_neg_set = set(hard_neg_data[i].tolist()) + assert hard_neg_dst.issubset(hard_neg_set) + assert rand_neg_dst.issubset(hard_neg_set) is False + + # case 1: + # 1. hard_negative field is string + # 2. num_hard_neg is int + # 3. num_negs > number of hard negatives required (2) + # 4. num_negs > total number of hard negatives + # provided (hard0 has 4 negatives for each node) + # 5. Each edge has enough hard negative even though some edges do not have enough (10) predefined negatives + # 6. slow track (-1 exists in hard neg feature) + # + # expected behavior: + # 1. Only 2 hard negatives are returned + # 2. Others will be random negatives + sampler = GlobalUniform(num_negs) + hard_sampler = GSHardEdgeDstNegative(num_negs, "hard_negative", sampler, num_hard_negs=2) + check_less_hard_negs(hard_sampler, etype0, hard0, 2, check_missing_hard_neg=False) + + # Case 2: + # 1. hard_negative field is string + # 2. num_hard_neg is int + # 3. num_negs > number of hard negatives required (2) + # 4. num_negs == total number of hard negatives + # provided (hard1 has 10 negatives for each node) + # 5. Each edge has enough hard negative even though some edges do not have enough (8) predefined negatives + # 6. slow track (-1 exists in hard neg feature) + # + # expected behavior: + # 1. Only 2 hard negatives are returned + # 2. Others will be random negatives + check_less_hard_negs(hard_sampler, etype1, hard1, 2, check_missing_hard_neg=False) + + # Case 3: + # 1. hard_negative field is string + # 2. num_hard_neg is int + # 3. num_negs > number of hard negatives required (8) + # 4. number of hard negatives required (8) > number of hard negatives + # provided (hard0 has only 4 negatives for each node) + # 5.slow track (-1 exists in hard neg feature) + # + # expected behavior: + # 1. Only 4 hard negatives are returned + # 2. Others will be random negatives + # 3. 
eid 0 will have 2 more random negatives + # and eid 1 will have 1 more random negatives + hard_sampler = GSHardEdgeDstNegative(num_negs, "hard_negative", sampler, num_hard_negs=8) + check_less_hard_negs(hard_sampler, etype0, hard0, 4, check_missing_hard_neg=True) + + # Case 4: + # 1. hard_negative field is string + # 2. num_hard_neg is int + # 3. num_negs == number of hard negatives required (10) + # 4. number of hard negatives required (8) == number of hard negatives + # provided (hard1 has 10 negatives for each node) + # 5.slow track (-1 exists in hard neg feature) + # + # expected behavior: + # 1. Equal negatives + def check_enough_hard_negs(hard_neg_sampler, target_etype, hard_neg_data): + neg_src, neg_dst = hard_neg_sampler._generate(g, eids, target_etype) + assert len(neg_src) == num_edges * num_negs + assert len(neg_dst) == num_edges * num_negs + assert_equal(th.repeat_interleave(src[:10], num_negs, 0).numpy(), neg_src.numpy()) + neg_dst = neg_dst.reshape(num_edges, num_negs) + test_missing_hard_neg(neg_dst, num_negs, hard_neg_data) + for i in range(2, num_edges): + hard_neg_dst = set(neg_dst[i].tolist()) + hard_neg_set = set(hard_neg_data[i].tolist()) + assert hard_neg_dst == hard_neg_set -@pytest.mark.parametrize("num_nodes", 100) -def test_hard_edge_dst_negative_sample_generate(num_nodes): + hard_sampler = GSHardEdgeDstNegative(num_negs, "hard_negative", sampler, num_hard_negs=num_negs) + check_enough_hard_negs(hard_sampler, etype1, hard1) + + # Case 5: + # 1. hard_negative field is string + # 2. num_hard_neg is int + # 3. num_negs == number of hard negatives required (10) + # 4. number of hard negatives required (8) < number of hard negatives + # provided (hard2 has 20 negatives for each node) + # 5.slow track (-1 exists in hard neg feature) + # + # expected behavior: + # 1. hard negatives will be a subset of hard2 + def check_more_hard_negs(hard_neg_sampler, target_etype, hard_neg_data): + neg_src, neg_dst = hard_neg_sampler._generate(g, eids, target_etype) + assert len(neg_src) == num_edges * num_negs + assert len(neg_dst) == num_edges * num_negs + assert_equal(th.repeat_interleave(src[:10], num_negs, 0).numpy(), neg_src.numpy()) + neg_dst = neg_dst.reshape(num_edges, num_negs) + for i in range(num_edges): + hard_neg_dst = set(neg_dst[i].tolist()) + hard_neg_set = set(hard_neg_data[i].tolist()) + assert hard_neg_dst.issubset(hard_neg_set) + check_more_hard_negs(hard_sampler, etype2, hard2) + +def test_hard_edge_dst_negative_sample_generate(): # test GSHardEdgeDstNegative._generate with fast track when all pos edges have enough hard negatives defined + num_nodes = 100 num_negs = 10 - etype0 = ("n0", "r0", "n1") - etype1 = ("n0", "r0", "n2") - etype2 = ("n0", "r0", "n3") - src = th.arange(num_nodes) - dst = th.arange(num_nodes) - hard0 = th.randint(num_nodes, (num_nodes, 4)) # not enough hard negatives - hard1 = th.randint(num_nodes, (num_nodes, num_negs)) - hard2 = th.randint(num_nodes, (num_nodes, num_negs*2)) # more hard negatives than num neg - g = dgl.heterograph({ - etype0: (src, dst), - etype1: (src, dst), - etype2: (src, dst), - }) - - g.edges[etype0].data["hard_negative"] = hard0 - g.edges[etype1].data["hard_negative"] = hard1 - g.edges[etype2].data["hard_negative"] = hard2 + etype0, etype1, etype2, hard0, hard1, hard2, src, g = _create_hard_neg_graph(num_nodes, num_negs) num_edges = 10 eids = th.arange(num_edges) @@ -1455,13 +1604,13 @@ def check_less_hard_negs(hard_neg_sampler, target_etype, hard_neg_data, num_hard # 1. Only 4 hard negatives are returned # 2. 
Others will be random negatives hard_sampler = GSHardEdgeDstNegative(num_negs, "hard_negative", sampler, num_hard_negs=8) - check_less_hard_negs(hard_sampler, etype2, hard0, 4) + check_less_hard_negs(hard_sampler, etype0, hard0, 4) # Case 4: # 1. hard_negative field is string # 2. num_hard_neg is int # 3. num_negs == number of hard negatives required (10) - # 4. number of hard negatives required (8) == number of hard negatives + # 4. number of hard negatives required (10) == number of hard negatives # provided (hard1 has 10 negatives for each node) # 5.fast track # @@ -1552,6 +1701,51 @@ def check_more_hard_negs(hard_neg_sampler, target_etype, hard_neg_data): # 1. hard negatives will be a subset of hard2 check_more_hard_negs(hard_sampler, etype2, hard2) +def test_hard_edge_dst_negative_sample_gen_neg_pairs_complex_case(): + num_nodes = 1000 + # test GSHardEdgeDstNegative._generate when all some pos edges do not have enough hard negatives defined + num_negs = 10 + etype0, etype1, etype2, hard0, hard1, hard2, src, g = _create_hard_neg_graph(num_nodes, num_negs) + +def test_hard_edge_dst_negative_sample_gen_neg_pairs(): + num_nodes = 1000 + # test GSHardEdgeDstNegative._generate when all some pos edges do not have enough hard negatives defined + num_negs = 10 + etype0, etype1, etype2, hard0, hard1, hard2, src, g = _create_hard_neg_graph(num_nodes, num_negs) + + num_edges = 10 + pos_pairs = (th.arange(10), th.arange(10)) + + def check_less_hard_negs(pos_neg_tuple, etype, hard_neg_data, num_hard_neg): + neg_src, _, pos_dst, neg_dst = pos_neg_tuple[etype] + + assert len(neg_src) == num_edges + assert len(pos_dst) == num_edges + assert neg_dst.shape[0] == num_edges + assert neg_dst.shape[1] == num_negs + + # check hard negative + for i in range(num_edges): + hard_neg_dst = neg_dst[i][:num_hard_neg] + hard_neg_dst = set(hard_neg_dst.tolist()) + rand_neg_dst = neg_dst[i][num_hard_neg:] + rand_neg_dst = set(rand_neg_dst.tolist()) + hard_neg_set = set(hard_neg_data[i].tolist()) + assert hard_neg_dst.issubset(hard_neg_set) + assert rand_neg_dst.issubset(hard_neg_set) is False + + neg_src, _, pos_dst, neg_dst = pos_neg_tuple[etype1] + + sampler = GlobalUniform(num_negs) + hard_sampler = GSHardEdgeDstNegative(num_negs, "hard_negative", sampler, num_hard_negs=2) + pos_neg_tuple = hard_sampler.gen_neg_pairs(g, pos_pairs) + + check_less_hard_negs(pos_neg_tuple, etype0, hard0, 2) + check_less_hard_negs(pos_neg_tuple, etype1, hard1, 2) + check_less_hard_negs(pos_neg_tuple, etype2, hard2, 2) + + + @pytest.mark.parametrize("num_pos", [2, 10]) @pytest.mark.parametrize("num_neg", [5, 20]) def test_inbatch_joint_neg_sampler(num_pos, num_neg): @@ -1578,7 +1772,10 @@ def test_inbatch_joint_neg_sampler(num_pos, num_neg): if __name__ == '__main__': - test_hard_edge_dst_negative_sample_generate(100) + test_hard_edge_dst_negative_sample_gen_neg_pairs_complex_case() + test_hard_edge_dst_negative_sample_gen_neg_pairs() + test_hard_edge_dst_negative_sample_generate_complex_case() + test_hard_edge_dst_negative_sample_generate() test_inbatch_joint_neg_sampler(10, 20) test_np_dataloader_len(11) From 497ab0e42b40b855ffbc553727f62518c80e65d6 Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Fri, 8 Dec 2023 14:30:41 -0800 Subject: [PATCH 04/17] fix bugs --- python/graphstorm/dataloading/sampler.py | 18 +- tests/unit-tests/test_dataloading.py | 264 +++++++++++++++++++++-- 2 files changed, 253 insertions(+), 29 deletions(-) diff --git a/python/graphstorm/dataloading/sampler.py b/python/graphstorm/dataloading/sampler.py index 
fdc1404f4b..f7753a54bb 100644 --- a/python/graphstorm/dataloading/sampler.py +++ b/python/graphstorm/dataloading/sampler.py @@ -206,8 +206,9 @@ def _gen_neg_pair(pos_pair, canonical_etype): dst_negative_field = None if dst_negative_field is None: - src, _, pos_dst, neg_dst = \ + random_neg_pairs = \ self._negative_sampler.gen_neg_pairs(g, {canonical_etype:pos_pair}) + src, _, pos_dst, neg_dst = random_neg_pairs[canonical_etype] return (src, None, pos_dst, neg_dst) hard_negatives = g.edges[canonical_etype].data[dst_negative_field][eids] @@ -219,26 +220,29 @@ def _gen_neg_pair(pos_pair, canonical_etype): # Fast track, there is no -1 in hard_negatives num_hard_neg = hard_negatives.shape[1] if self._k < num_hard_neg: - hard_negatives = hard_negatives[:self._k] + hard_negatives = hard_negatives[:,:self._k] return (src, None, pos_dst, hard_negatives) else: # random negative are needed - src, _, pos_dst, neg_dst = \ - self._negative_sampler.gen_neg_pairs(g, {canonical_etype:pos_pair}) + random_neg_pairs = \ + self._negative_sampler.gen_neg_pairs(g, + {canonical_etype:pos_pair}) + src, _, pos_dst, neg_dst = random_neg_pairs[canonical_etype] neg_dst[:,:num_hard_neg] = hard_negatives return (src, None, pos_dst, neg_dst) else: # slow track, we need to handle cases when there are -1s - hard_negatives, _ = th.sort(dim=1, descending=True) + hard_negatives, _ = th.sort(hard_negatives, dim=1, descending=True) - src, _, pos_dst, neg_dst = \ + random_neg_pairs = \ self._negative_sampler.gen_neg_pairs(g, {canonical_etype:pos_pair}) + src, _, pos_dst, neg_dst = random_neg_pairs[canonical_etype] for i in range(len(eids)): hard_negative = hard_negatives[i] # ignore -1s hard_negative = hard_negative[hard_negative > -1] num_hard_neg = hard_negative.shape[0] - neg_dst[:num_hard_neg if num_hard_neg < self._k else self._k] = \ + neg_dst[i][:num_hard_neg if num_hard_neg < self._k else self._k] = \ hard_negative[:num_hard_neg if num_hard_neg < self._k else self._k] return (src, _, pos_dst, neg_dst) diff --git a/tests/unit-tests/test_dataloading.py b/tests/unit-tests/test_dataloading.py index 52260e7042..e72fc656ca 100644 --- a/tests/unit-tests/test_dataloading.py +++ b/tests/unit-tests/test_dataloading.py @@ -1353,13 +1353,14 @@ def _create_hard_neg_graph(num_nodes, num_negs): g.edges[etype1].data["hard_negative"] = hard1 g.edges[etype2].data["hard_negative"] = hard2 - return etype0, etype1, etype2, hard0, hard1, hard2, src, g + return etype0, etype1, etype2, hard0, hard1, hard2, src, dst, g def test_hard_edge_dst_negative_sample_generate_complex_case(): + # test GSHardEdgeDstNegative._generate with slow track when not all the pos edges have enough hard negatives defined num_nodes = 1000 # test GSHardEdgeDstNegative._generate when all some pos edges do not have enough hard negatives defined num_negs = 10 - etype0, etype1, etype2, hard0, hard1, hard2, src, g = _create_hard_neg_graph(num_nodes, num_negs) + etype0, etype1, etype2, hard0, hard1, hard2, src, _, g = _create_hard_neg_graph(num_nodes, num_negs) # not enough predefined hard negatives # for hard0[0] and hard0[1] @@ -1386,7 +1387,7 @@ def test_hard_edge_dst_negative_sample_generate_complex_case(): num_edges = 10 eids = th.arange(num_edges) - def test_missing_hard_neg(neg_dst, num_hard_neg, hard_neg_data): + def test_missing_hard_negs(neg_dst, num_hard_neg, hard_neg_data): # hardx[0][-1] and hardx[0][-2] is -1, # which means hardx[0] does not enough predefined negatives # Random sample will be applied to -1s. 
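A small illustration of the descending sort used on the slow track above: sorting each row in descending order pushes the -1 padding to the tail, so the leading entries are the usable hard-negative node ids (the tensor values here are made up):

    import torch as th
    row = th.tensor([[7, -1, 42, -1, 3]])
    sorted_row, _ = th.sort(row, dim=1, descending=True)
    # sorted_row -> tensor([[42,  7,  3, -1, -1]])
    usable = sorted_row[sorted_row > -1]       # tensor([42,  7,  3])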
@@ -1432,7 +1433,7 @@ def check_less_hard_negs(hard_neg_sampler, target_etype, hard_neg_data, neg_dst = neg_dst.reshape(num_edges, num_negs) if check_missing_hard_neg: - test_missing_hard_neg(neg_dst, num_hard_neg, hard_neg_data) + test_missing_hard_negs(neg_dst, num_hard_neg, hard_neg_data) start = 2 if check_missing_hard_neg else 0 for i in range(start, num_edges): @@ -1507,7 +1508,7 @@ def check_enough_hard_negs(hard_neg_sampler, target_etype, hard_neg_data): assert_equal(th.repeat_interleave(src[:10], num_negs, 0).numpy(), neg_src.numpy()) neg_dst = neg_dst.reshape(num_edges, num_negs) - test_missing_hard_neg(neg_dst, num_negs, hard_neg_data) + test_missing_hard_negs(neg_dst, num_negs, hard_neg_data) for i in range(2, num_edges): hard_neg_dst = set(neg_dst[i].tolist()) @@ -1543,7 +1544,7 @@ def test_hard_edge_dst_negative_sample_generate(): # test GSHardEdgeDstNegative._generate with fast track when all pos edges have enough hard negatives defined num_nodes = 100 num_negs = 10 - etype0, etype1, etype2, hard0, hard1, hard2, src, g = _create_hard_neg_graph(num_nodes, num_negs) + etype0, etype1, etype2, hard0, hard1, hard2, src, _, g = _create_hard_neg_graph(num_nodes, num_negs) num_edges = 10 eids = th.arange(num_edges) @@ -1559,8 +1560,6 @@ def check_less_hard_negs(hard_neg_sampler, target_etype, hard_neg_data, num_hard rand_neg_dst = neg_dst[i][num_hard_neg:] rand_neg_dst = set(rand_neg_dst.tolist()) hard_neg_set = set(hard_neg_data[i].tolist()) - print(hard_neg_dst) - print(hard_neg_set) assert hard_neg_dst.issubset(hard_neg_set) assert rand_neg_dst.issubset(hard_neg_set) is False @@ -1625,9 +1624,6 @@ def check_enough_hard_negs(hard_neg_sampler, target_etype, hard_neg_data): for i in range(num_edges): hard_neg_dst = set(neg_dst[i].tolist()) hard_neg_set = set(hard_neg_data[i].tolist()) - print(i) - print(hard_neg_dst) - print(hard_neg_set) assert hard_neg_dst == hard_neg_set hard_sampler = GSHardEdgeDstNegative(num_negs, "hard_negative", sampler, num_hard_negs=num_negs) check_enough_hard_negs(hard_sampler, etype1, hard1) @@ -1701,28 +1697,180 @@ def check_more_hard_negs(hard_neg_sampler, target_etype, hard_neg_data): # 1. 
hard negatives will be a subset of hard2 check_more_hard_negs(hard_sampler, etype2, hard2) + def check_none_hard_negs(hard_neg_sampler, target_etype, hard_neg_data): + neg_src, neg_dst = hard_neg_sampler._generate(g, eids, target_etype) + assert len(neg_src) == num_edges * num_negs + assert len(neg_dst) == num_edges * num_negs + assert_equal(th.repeat_interleave(src[:10], num_negs, 0).numpy(), neg_src.numpy()) + neg_dst = neg_dst.reshape(num_edges, num_negs) + for i in range(num_edges): + hard_neg_dst = set(neg_dst[i].tolist()) + hard_neg_set = set(hard_neg_data[i].tolist()) + assert hard_neg_dst.issubset(hard_neg_set) is False + # Case 9: + # dst_negative_field is not provided + hard_sampler = GSHardEdgeDstNegative( + num_negs, {}, sampler, 2) + check_none_hard_negs(hard_sampler, etype2, hard2) + + # Case 10: + # num_hard_negs is not provided + hard_sampler = GSHardEdgeDstNegative( + num_negs, "hard_negative", sampler, {}) + check_none_hard_negs(hard_sampler, etype2, hard2) + def test_hard_edge_dst_negative_sample_gen_neg_pairs_complex_case(): + # test GSHardEdgeDstNegative.gen_neg_pairs with slow track when not all edges have enough predefined negatives num_nodes = 1000 # test GSHardEdgeDstNegative._generate when all some pos edges do not have enough hard negatives defined num_negs = 10 - etype0, etype1, etype2, hard0, hard1, hard2, src, g = _create_hard_neg_graph(num_nodes, num_negs) + etype0, etype1, etype2, hard0, hard1, hard2, src, dst, g = _create_hard_neg_graph(num_nodes, num_negs) + + # not enough predefined hard negatives + # for hard0[0] and hard0[1] + hard0[0] = th.randperm(num_nodes)[:4] + hard0[0][-1] = -1 + hard0[0][-2] = -1 + hard0[1][-1] = -1 + + # not enough predefined hard negatives + # for hard0[0] and hard0[1] + hard1[0] = th.randperm(num_nodes)[:num_negs] + hard1[1] = th.randperm(num_nodes)[:num_negs] + hard1[0][-1] = -1 + hard1[0][-2] = -1 + hard1[1][-1] = -1 + + # not enough predefined hard negatives + # for hard0[0] and hard0[1] + hard2[0] = th.randperm(num_nodes)[:num_negs*2] + hard2[1] = th.randperm(num_nodes)[:num_negs*2] + hard2[0][-1] = -1 + hard2[0][-2] = -1 + hard2[1][-1] = -1 + + num_edges = 10 + pos_pairs = {etype0: (th.arange(10), th.arange(10)), + etype1: (th.arange(10), th.arange(10)), + etype2: (th.arange(10), th.arange(10))} + + def test_missing_hard_negs(neg_dst, hard_neg_data, num_hard_neg): + # hardx[0][-1] and hardx[0][-2] is -1, + # which means hardx[0] does not enough predefined negatives + # Random sample will be applied to -1s. + hard_neg_dst = neg_dst[0][:num_hard_neg] + hard_neg_rand_0 = hard_neg_dst[-1] + hard_neg_rand_1 = hard_neg_dst[-2] + hard_neg_dst = set(hard_neg_dst[:-2].tolist()) + rand_neg_dst = neg_dst[0][num_hard_neg:] + rand_neg_dst = set(rand_neg_dst.tolist()) + hard_neg_set = set(hard_neg_data[0].tolist()) + assert hard_neg_dst.issubset(hard_neg_set) + assert len(rand_neg_dst) == 0 or \ + rand_neg_dst.issubset(hard_neg_set) is False + + rand_0_check = hard_neg_rand_0 not in hard_neg_set + rand_1_check = hard_neg_rand_1 not in hard_neg_set + + # hardx[1][-1] is -1, + # which means hardx[0] does not enough predefined negatives + # Random sample will be applied to -1s. 
+ hard_neg_dst = neg_dst[1][:num_hard_neg] + hard_neg_rand_2 = hard_neg_dst[-1] + hard_neg_dst = set(hard_neg_dst[:-1].tolist()) + rand_neg_dst = neg_dst[1][num_hard_neg:] + rand_neg_dst = set(rand_neg_dst.tolist()) + hard_neg_set = set(hard_neg_data[1].tolist()) + assert hard_neg_dst.issubset(hard_neg_set) + assert len(rand_neg_dst) == 0 or \ + rand_neg_dst.issubset(hard_neg_set) is False + + rand_2_check = hard_neg_rand_2 not in hard_neg_set + # The chance is very to to have rand_0_check, + # rand_1_check and rand_2_check be true at the same time + # The change is (4/1000)^3 + assert rand_0_check or rand_1_check or rand_2_check + + def check_hard_negs(pos_neg_tuple, etype, hard_neg_data, + num_hard_neg, check_missing_hard_neg): + neg_src, _, pos_dst, neg_dst = pos_neg_tuple[etype] + + assert len(neg_src) == num_edges + assert len(pos_dst) == num_edges + assert neg_dst.shape[0] == num_edges + assert neg_dst.shape[1] == num_negs + assert_equal(src[:10].numpy(), neg_src.numpy()) + assert_equal(dst[:10].numpy(), pos_dst.numpy()) + + if check_missing_hard_neg: + test_missing_hard_negs(neg_dst, hard_neg_data, num_hard_neg) + + start = 2 if check_missing_hard_neg else 0 + for i in range(start, num_edges): + hard_neg_dst = neg_dst[i][:num_hard_neg] + hard_neg_dst = set(hard_neg_dst.tolist()) + rand_neg_dst = neg_dst[i][num_hard_neg:] + rand_neg_dst = set(rand_neg_dst.tolist()) + hard_neg_set = set(hard_neg_data[i].tolist()) + assert hard_neg_dst.issubset(hard_neg_set) + assert len(rand_neg_dst) == 0 or \ + rand_neg_dst.issubset(hard_neg_set) is False + + sampler = GlobalUniform(num_negs) + hard_sampler = GSHardEdgeDstNegative(num_negs, "hard_negative", sampler) + pos_neg_tuple = hard_sampler.gen_neg_pairs(g, pos_pairs) + + # Case 1: + # 1. hard_negative field is string + # 2. The is not enough predefined negative for gen_neg_pairs + # 3. fast track + # 4. slow track (-1 exists in hard neg feature) + # + # expected behavior: + # 1. Only 4 hard negatives are returned + # 2. Others will be random negatives + check_hard_negs(pos_neg_tuple, etype0, hard0, hard0.shape[1], check_missing_hard_neg=True) + # Case 2: + # 1. hard_negative field is string + # 2. num_negs == total number of predefined negatives + # 3. fast track + # 4. slow track (-1 exists in hard neg feature) + # + # expected behavior: + # 1. all negatives are predefined negatives + check_hard_negs(pos_neg_tuple, etype1, hard1, hard1.shape[1], check_missing_hard_neg=True) + # Case 3: + # 1. hard_negative field is string + # 2. num_negs < total number of predefined negatives + # 3. fast track + # 4. slow track (-1 exists in hard neg feature) + # + # expected behavior: + # 1. 
all negatives are predefined negatives + check_hard_negs(pos_neg_tuple, etype2, hard2, num_negs, check_missing_hard_neg=False) + def test_hard_edge_dst_negative_sample_gen_neg_pairs(): + # test GSHardEdgeDstNegative.gen_neg_pairs with fast track when all edges have enough predefined negatives num_nodes = 1000 # test GSHardEdgeDstNegative._generate when all some pos edges do not have enough hard negatives defined num_negs = 10 - etype0, etype1, etype2, hard0, hard1, hard2, src, g = _create_hard_neg_graph(num_nodes, num_negs) + etype0, etype1, etype2, hard0, hard1, hard2, src, dst, g = _create_hard_neg_graph(num_nodes, num_negs) num_edges = 10 - pos_pairs = (th.arange(10), th.arange(10)) + pos_pairs = {etype0: (th.arange(10), th.arange(10)), + etype1: (th.arange(10), th.arange(10)), + etype2: (th.arange(10), th.arange(10))} - def check_less_hard_negs(pos_neg_tuple, etype, hard_neg_data, num_hard_neg): + def check_hard_negs(pos_neg_tuple, etype, hard_neg_data, num_hard_neg): neg_src, _, pos_dst, neg_dst = pos_neg_tuple[etype] assert len(neg_src) == num_edges assert len(pos_dst) == num_edges assert neg_dst.shape[0] == num_edges assert neg_dst.shape[1] == num_negs + assert_equal(src[:10].numpy(), neg_src.numpy()) + assert_equal(dst[:10].numpy(), pos_dst.numpy()) # check hard negative for i in range(num_edges): @@ -1732,19 +1880,91 @@ def check_less_hard_negs(pos_neg_tuple, etype, hard_neg_data, num_hard_neg): rand_neg_dst = set(rand_neg_dst.tolist()) hard_neg_set = set(hard_neg_data[i].tolist()) assert hard_neg_dst.issubset(hard_neg_set) - assert rand_neg_dst.issubset(hard_neg_set) is False - - neg_src, _, pos_dst, neg_dst = pos_neg_tuple[etype1] + assert len(rand_neg_dst) == 0 or \ + rand_neg_dst.issubset(hard_neg_set) is False sampler = GlobalUniform(num_negs) - hard_sampler = GSHardEdgeDstNegative(num_negs, "hard_negative", sampler, num_hard_negs=2) + hard_sampler = GSHardEdgeDstNegative(num_negs, "hard_negative", sampler) pos_neg_tuple = hard_sampler.gen_neg_pairs(g, pos_pairs) - check_less_hard_negs(pos_neg_tuple, etype0, hard0, 2) - check_less_hard_negs(pos_neg_tuple, etype1, hard1, 2) - check_less_hard_negs(pos_neg_tuple, etype2, hard2, 2) + # Case 1: + # 1. hard_negative field is string + # 2. The is not enough predefined negative for gen_neg_pairs + # 3. fast track + # + # expected behavior: + # 1. Only 4 hard negatives are returned + # 2. Others will be random negatives + check_hard_negs(pos_neg_tuple, etype0, hard0, hard0.shape[1]) + # Case 2: + # 1. hard_negative field is string + # 2. num_negs == total number of predefined negatives + # 3. fast track + # + # expected behavior: + # 1. all negatives are predefined negatives + check_hard_negs(pos_neg_tuple, etype1, hard1, hard1.shape[1]) + # Case 3: + # 1. hard_negative field is string + # 2. num_negs < total number of predefined negatives + # 3. fast track + # + # expected behavior: + # 1. all negatives are predefined negatives + check_hard_negs(pos_neg_tuple, etype2, hard2, num_negs) + + hard_sampler = GSHardEdgeDstNegative(num_negs, + {etype0: "hard_negative", + etype1: "hard_negative", + etype2: "hard_negative"}, + sampler) + # Case 4: + # 1. hard_negative field is dict + # 2. The is not enough predefined negative for gen_neg_pairs + # 3. fast track + # + # expected behavior: + # 1. Only 4 hard negatives are returned + # 2. Others will be random negatives + check_hard_negs(pos_neg_tuple, etype0, hard0, hard0.shape[1]) + # Case 5: + # 1. hard_negative field is dict + # 2. num_negs == total number of predefined negatives + # 3. 
fast track + # + # expected behavior: + # 1. all negatives are predefined negatives + check_hard_negs(pos_neg_tuple, etype1, hard1, hard1.shape[1]) + # Case 6: + # 1. hard_negative field is dict + # 2. num_negs < total number of predefined negatives + # 3. fast track + # + # expected behavior: + # 1. all negatives are predefined negatives + check_hard_negs(pos_neg_tuple, etype2, hard2, num_negs) + + def check_none_hard_negs(pos_neg_tuple, etype, hard_neg_data): + neg_src, _, pos_dst, neg_dst = pos_neg_tuple[etype] + + assert len(neg_src) == num_edges + assert len(pos_dst) == num_edges + assert neg_dst.shape[0] == num_edges + assert neg_dst.shape[1] == num_negs + assert_equal(src[:10].numpy(), neg_src.numpy()) + assert_equal(dst[:10].numpy(), pos_dst.numpy()) + for i in range(num_edges): + hard_neg_dst = set(neg_dst[i].tolist()) + hard_neg_set = set(hard_neg_data[i].tolist()) + assert hard_neg_dst.issubset(hard_neg_set) is False + # Case 9: + # dst_negative_field is not provided + hard_sampler = GSHardEdgeDstNegative( + num_negs, {}, sampler) + pos_neg_tuple = hard_sampler.gen_neg_pairs(g, pos_pairs) + check_none_hard_negs(pos_neg_tuple, etype2, hard2) @pytest.mark.parametrize("num_pos", [2, 10]) @pytest.mark.parametrize("num_neg", [5, 20]) From 7961e4273483ef84ba179f247108f955dd577713 Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Tue, 19 Dec 2023 01:18:12 -0800 Subject: [PATCH 05/17] resolve some comments --- python/graphstorm/config/argument.py | 79 +++++++++++++++----- python/graphstorm/dataloading/dataloading.py | 6 +- python/graphstorm/dataloading/sampler.py | 19 ++++- tests/unit-tests/test_dataloading.py | 42 +++++------ 4 files changed, 102 insertions(+), 44 deletions(-) diff --git a/python/graphstorm/config/argument.py b/python/graphstorm/config/argument.py index e8416d1406..a017b915e8 100644 --- a/python/graphstorm/config/argument.py +++ b/python/graphstorm/config/argument.py @@ -1875,25 +1875,26 @@ def lp_edge_weight_for_loss(self): return None @property - def train_hard_edge_dstnode_negative(self): - """ The list of canonical etypes that have hard negative sets + def train_etypes_negative_dstnode(self): + """ The list of canonical etypes that have hard negative edges + constructed by corrupting destination nodes. The format of the arguement should be: - train_hard_edge_dstnode_negative: + train_etypes_negative_dstnode: - src_type,rel_type0,dst_type:negative_nid_field - src_type,rel_type1,dst_type:negative_nid_field Each edge type can have different fields storing the hard negatives. or - train_hard_edge_dstnode_negative: + train_etypes_negative_dstnode: - negative_nid_field All the edge types use the same filed storing the hard negatives. 
""" # pylint: disable=no-member - if hasattr(self, "_train_hard_edge_dstnode_negative"): + if hasattr(self, "_train_etypes_negative_dstnode"): assert self.task_type == BUILTIN_TASK_LINK_PREDICTION, \ "Hard negative only works with link prediction" - hard_negatives = self._train_hard_edge_dstnode_negative + hard_negatives = self._train_etypes_negative_dstnode if len(hard_negatives) == 1 and \ ":" not in hard_negatives[0]: # global feat_name @@ -1903,7 +1904,14 @@ def train_hard_edge_dstnode_negative(self): hard_negative_dict = {} for hard_negative in hard_negatives: negative_info = hard_negative.split(":") + assert len(negative_info) == 2, \ + "negative dstnode information must be provided in format of " \ + f"src,relation,dst:feature_name, but get {hard_negative}" + etype = tuple(negative_info[0].split(",")) + assert len(etype) == 3, \ + f"Edge type must in format of (src,relation,dst), but get {etype}" + assert etype not in hard_negative_dict, \ f"You already specify the fixed negative of {etype} " \ f"as {hard_negative_dict[etype]}" @@ -1915,25 +1923,25 @@ def train_hard_edge_dstnode_negative(self): return None @property - def num_hard_negatives(self): + def num_train_hard_negatives(self): """ Number of hard negatives per edge type The format of the arguement should be: - num_hard_negatives: + num_train_hard_negatives: - src_type,rel_type0,dst_type:num_negatives - src_type,rel_type1,dst_type:num_negatives Each edge type can have different number of hard negatives. or - num_hard_negatives: + num_train_hard_negatives: - num_negatives All the edge types use the same number of hard negatives. """ # pylint: disable=no-member - if hasattr(self, "_num_hard_negatives"): + if hasattr(self, "_num_train_hard_negatives"): assert self.task_type == BUILTIN_TASK_LINK_PREDICTION, \ "Hard negative only works with link prediction" - num_negatives = self._num_hard_negatives + num_negatives = self._num_train_hard_negatives if len(num_negatives) == 1 and \ ":" not in num_negatives[0]: # global feat_name @@ -1954,25 +1962,26 @@ def num_hard_negatives(self): return None @property - def eval_fixed_edge_dstnode_negative(self): - """ The list of canonical etypes that have predefined negative sets + def eval_etypes_negative_dstnode(self): + """ The list of canonical etypes that have predefined negative edges + constructed by corrupting destination nodes. The format of the arguement should be: - eval_fixed_edge_dstnode_negative: + eval_etypes_negative_dstnode: - src_type,rel_type0,dst_type:negative_nid_field - src_type,rel_type1,dst_type:negative_nid_field Each edge type can have different fields storing the fixed negatives. or - eval_fixed_edge_dstnode_negative: + eval_etypes_negative_dstnode: - negative_nid_field All the edge types use the same filed storing the fixed negatives. 
""" # pylint: disable=no-member - if hasattr(self, "_eval_fixed_edge_dstnode_negative"): + if hasattr(self, "_eval_etypes_negative_dstnode"): assert self.task_type == BUILTIN_TASK_LINK_PREDICTION, \ "Fixed negative only works with link prediction" - fixed_negatives = self._eval_fixed_edge_dstnode_negative + fixed_negatives = self._eval_etypes_negative_dstnode if len(fixed_negatives) == 1 and \ ":" not in fixed_negatives[0]: # global feat_name @@ -1982,7 +1991,13 @@ def eval_fixed_edge_dstnode_negative(self): fixed_negative_dict = {} for fixed_negative in fixed_negatives: negative_info = fixed_negative.split(":") + assert len(negative_info) == 2, \ + "negative dstnode information must be provided in format of " \ + f"src,relation,dst:feature_name, but get {fixed_negative}" + etype = tuple(negative_info[0].split(",")) + assert len(etype) == 3, \ + f"Edge type must in format of (src,relation,dst), but get {etype}" assert etype not in fixed_negative_dict, \ f"You already specify the fixed negative of {etype} " \ f"as {fixed_negative_dict[etype]}" @@ -2599,6 +2614,36 @@ def _add_link_prediction_args(parser): "metrics of each edge type to select the best model" "2) '--model-select-etype query,adds,item': Use the evaluation " "metric of the edge type (query,adds,item) to select the best model") + group.add_argument("--train-etypes-negative-dstnode", nargs='+', + type=str, default=argparse.SUPPRESS, + help="Edge feature field name for user defined negative destination ndoes " + "for training. The negative nodes are used to construct hard negative edges " + "by corrupting positive edges' destination nodes." + "It can be in following format: " + "1) '--train-etypes-negative-dstnode negative_nid_field', " + "if all edge types use the same negative destination node filed." + "2) '--train-etypes-negative-dstnode query,adds,asin:neg0 query,clicks,asin:neg1 ...'" + "Different edge types have different negative destination node fields." + ) + group.add_argument("--eval-etypes-negative-dstnode", nargs='+', + type=str, default=argparse.SUPPRESS, + help="Edge feature field name for user defined negative destination ndoes " + "for evaluation. The negative nodes are used to construct negative edges " + "by corrupting test edges' destination nodes." + "It can be in following format: " + "1) '--eval-etypes-negative-dstnode negative_nid_field', " + "if all edge types use the same negative destination node filed." + "2) '--eval-etypes-negative-dstnode query,adds,asin:neg0 query,clicks,asin:neg1 ...'" + "Different edge types have different negative destination node fields." + ) + group.add_argument("--num-train-hard-negatives", nargs='+', + type=str, default=argparse.SUPPRESS, + help="Number of hard negatives for each edge type during training." + "It can be in following format: " + "1) '--num-train-hard-negatives 10', " + "if all edge types use the same number of hard negatives." 
+ "2) '--num-train-hard-negatives query,adds,asin:5 query,clicks,asin:10 ...'" + "Different edge types have different number of hard negatives.") return parser diff --git a/python/graphstorm/dataloading/dataloading.py b/python/graphstorm/dataloading/dataloading.py index afa1a7bf5c..53d4e4ec15 100644 --- a/python/graphstorm/dataloading/dataloading.py +++ b/python/graphstorm/dataloading/dataloading.py @@ -34,7 +34,7 @@ InbatchJointUniform, FastMultiLayerNeighborSampler, DistributedFileSampler, - GSHardEdgeDstNegative) + GSHardEdgeDstNegativeSampler) from .utils import trim_data, modify_fanout_for_target_etype from .dataset import GSDistillData @@ -1014,7 +1014,7 @@ def _prepare_negative_sampler(self, num_negative_edges): self._neg_sample_type = BUILTIN_LP_UNIFORM_NEG_SAMPLER negative_sampler = GlobalUniform(num_negative_edges) if self._fixed_edge_dst_negative_field: - negative_sampler = GSHardEdgeDstNegative(num_negative_edges, + negative_sampler = GSHardEdgeDstNegativeSampler(num_negative_edges, self._fixed_edge_dst_negative_field, negative_sampler) return negative_sampler @@ -1066,7 +1066,7 @@ def _prepare_negative_sampler(self, num_negative_edges): negative_sampler = JointUniform(num_negative_edges) self._neg_sample_type = BUILTIN_LP_JOINT_NEG_SAMPLER if self._fixed_edge_dst_negative_field: - negative_sampler = GSHardEdgeDstNegative(num_negative_edges, + negative_sampler = GSHardEdgeDstNegativeSampler(num_negative_edges, self._fixed_edge_dst_negative_field, negative_sampler) return negative_sampler diff --git a/python/graphstorm/dataloading/sampler.py b/python/graphstorm/dataloading/sampler.py index f7753a54bb..01858caf97 100644 --- a/python/graphstorm/dataloading/sampler.py +++ b/python/graphstorm/dataloading/sampler.py @@ -72,8 +72,8 @@ def _generate(self, g, eids, canonical_etype): dst = F.randint(shape, dtype, ctx, 0, self._local_neg_nids[vtype].shape[0]) return src, self._local_neg_nids[vtype][dst] -class GSHardEdgeDstNegative(object): - """ GraphStorm negativer sampler that chooses negative destination nodes +class GSHardEdgeDstNegativeSampler(object): + """ GraphStorm negative sampler that chooses negative destination nodes from a fixed set to create negative edges. Parameters @@ -127,7 +127,6 @@ def _generate(self, g, eids, canonical_etype): # shuffle the hard negatives hard_negatives = hard_negatives[:,neg_idx] - print(f"{required_num_hard_neg} {max_num_hard_neg} {self._k}") if required_num_hard_neg >= self._k and max_num_hard_neg >= self._k: # All negative should be hard negative and # there are enough hard negatives. @@ -175,6 +174,20 @@ def _generate(self, g, eids, canonical_etype): hard_negative[:num_hard_neg if num_hard_neg < self._k else self._k] return src, neg +class GSFixedEdgeDstNegativeSampler(object): + """ GraphStorm negative sampler that uses fixed negative destination nodes + to create negative edges. + + Parameters + ---------- + dst_negative_field: str or dict of str + The field storing the hard negatives. + """ + def __init__(self, dst_negative_field): + assert is_wholegraph() is False, \ + "Hard negative is not supported for WholeGraph." + self._dst_negative_field = dst_negative_field + def gen_neg_pairs(self, g, pos_pairs): """ Returns negative examples associated with positive examples. It only return dst negatives. 
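For reference, a minimal sketch of how the renamed hard-negative sampler wraps a base negative sampler, following the constructor calls exercised in the accompanying unit tests; the edge feature name "hard_negative" and the counts are illustrative only, not a prescribed configuration:

    from graphstorm.dataloading.sampler import GlobalUniform, GSHardEdgeDstNegativeSampler

    num_negs = 10
    base_sampler = GlobalUniform(num_negs)  # supplies the random negatives
    # Per positive edge, up to 2 of the 10 negative destination nodes are taken
    # from the per-edge "hard_negative" feature; the remaining slots are filled
    # by the wrapped base sampler.
    hard_sampler = GSHardEdgeDstNegativeSampler(num_negs, "hard_negative",
                                                base_sampler, num_hard_negs=2)

The wrapper pattern keeps the hard-negative logic orthogonal to the choice of random sampler, so the same class can wrap GlobalUniform, JointUniform, or any other sampler exposing the same interface.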
diff --git a/tests/unit-tests/test_dataloading.py b/tests/unit-tests/test_dataloading.py index e72fc656ca..025309e0aa 100644 --- a/tests/unit-tests/test_dataloading.py +++ b/tests/unit-tests/test_dataloading.py @@ -58,7 +58,7 @@ from graphstorm.dataloading.sampler import InbatchJointUniform from graphstorm.dataloading.sampler import GlobalUniform -from graphstorm.dataloading.sampler import GSHardEdgeDstNegative +from graphstorm.dataloading.sampler import GSHardEdgeDstNegativeSampler from graphstorm.dataloading.dataset import (prepare_batch_input, prepare_batch_edge_input) @@ -1356,9 +1356,9 @@ def _create_hard_neg_graph(num_nodes, num_negs): return etype0, etype1, etype2, hard0, hard1, hard2, src, dst, g def test_hard_edge_dst_negative_sample_generate_complex_case(): - # test GSHardEdgeDstNegative._generate with slow track when not all the pos edges have enough hard negatives defined + # test GSHardEdgeDstNegativeSampler._generate with slow track when not all the pos edges have enough hard negatives defined num_nodes = 1000 - # test GSHardEdgeDstNegative._generate when all some pos edges do not have enough hard negatives defined + # test GSHardEdgeDstNegativeSampler._generate when all some pos edges do not have enough hard negatives defined num_negs = 10 etype0, etype1, etype2, hard0, hard1, hard2, src, _, g = _create_hard_neg_graph(num_nodes, num_negs) @@ -1458,7 +1458,7 @@ def check_less_hard_negs(hard_neg_sampler, target_etype, hard_neg_data, # 1. Only 2 hard negatives are returned # 2. Others will be random negatives sampler = GlobalUniform(num_negs) - hard_sampler = GSHardEdgeDstNegative(num_negs, "hard_negative", sampler, num_hard_negs=2) + hard_sampler = GSHardEdgeDstNegativeSampler(num_negs, "hard_negative", sampler, num_hard_negs=2) check_less_hard_negs(hard_sampler, etype0, hard0, 2, check_missing_hard_neg=False) # Case 2: @@ -1488,7 +1488,7 @@ def check_less_hard_negs(hard_neg_sampler, target_etype, hard_neg_data, # 2. Others will be random negatives # 3. eid 0 will have 2 more random negatives # and eid 1 will have 1 more random negatives - hard_sampler = GSHardEdgeDstNegative(num_negs, "hard_negative", sampler, num_hard_negs=8) + hard_sampler = GSHardEdgeDstNegativeSampler(num_negs, "hard_negative", sampler, num_hard_negs=8) check_less_hard_negs(hard_sampler, etype0, hard0, 4, check_missing_hard_neg=True) # Case 4: @@ -1515,7 +1515,7 @@ def check_enough_hard_negs(hard_neg_sampler, target_etype, hard_neg_data): hard_neg_set = set(hard_neg_data[i].tolist()) assert hard_neg_dst == hard_neg_set - hard_sampler = GSHardEdgeDstNegative(num_negs, "hard_negative", sampler, num_hard_negs=num_negs) + hard_sampler = GSHardEdgeDstNegativeSampler(num_negs, "hard_negative", sampler, num_hard_negs=num_negs) check_enough_hard_negs(hard_sampler, etype1, hard1) # Case 5: @@ -1541,7 +1541,7 @@ def check_more_hard_negs(hard_neg_sampler, target_etype, hard_neg_data): check_more_hard_negs(hard_sampler, etype2, hard2) def test_hard_edge_dst_negative_sample_generate(): - # test GSHardEdgeDstNegative._generate with fast track when all pos edges have enough hard negatives defined + # test GSHardEdgeDstNegativeSampler._generate with fast track when all pos edges have enough hard negatives defined num_nodes = 100 num_negs = 10 etype0, etype1, etype2, hard0, hard1, hard2, src, _, g = _create_hard_neg_graph(num_nodes, num_negs) @@ -1575,7 +1575,7 @@ def check_less_hard_negs(hard_neg_sampler, target_etype, hard_neg_data, num_hard # 1. Only 2 hard negatives are returned # 2. 
Others will be random negatives sampler = GlobalUniform(num_negs) - hard_sampler = GSHardEdgeDstNegative(num_negs, "hard_negative", sampler, num_hard_negs=2) + hard_sampler = GSHardEdgeDstNegativeSampler(num_negs, "hard_negative", sampler, num_hard_negs=2) check_less_hard_negs(hard_sampler, etype0, hard0, 2) # Case 2: @@ -1602,7 +1602,7 @@ def check_less_hard_negs(hard_neg_sampler, target_etype, hard_neg_data, num_hard # expected behavior: # 1. Only 4 hard negatives are returned # 2. Others will be random negatives - hard_sampler = GSHardEdgeDstNegative(num_negs, "hard_negative", sampler, num_hard_negs=8) + hard_sampler = GSHardEdgeDstNegativeSampler(num_negs, "hard_negative", sampler, num_hard_negs=8) check_less_hard_negs(hard_sampler, etype0, hard0, 4) # Case 4: @@ -1625,7 +1625,7 @@ def check_enough_hard_negs(hard_neg_sampler, target_etype, hard_neg_data): hard_neg_dst = set(neg_dst[i].tolist()) hard_neg_set = set(hard_neg_data[i].tolist()) assert hard_neg_dst == hard_neg_set - hard_sampler = GSHardEdgeDstNegative(num_negs, "hard_negative", sampler, num_hard_negs=num_negs) + hard_sampler = GSHardEdgeDstNegativeSampler(num_negs, "hard_negative", sampler, num_hard_negs=num_negs) check_enough_hard_negs(hard_sampler, etype1, hard1) # Case 5: @@ -1661,7 +1661,7 @@ def check_more_hard_negs(hard_neg_sampler, target_etype, hard_neg_data): # expected behavior: # 1. Only 2 hard negatives are returned # 2. Others will be random negatives - hard_sampler = GSHardEdgeDstNegative( + hard_sampler = GSHardEdgeDstNegativeSampler( num_negs, {etype0: "hard_negative", etype1: "hard_negative", @@ -1709,20 +1709,20 @@ def check_none_hard_negs(hard_neg_sampler, target_etype, hard_neg_data): assert hard_neg_dst.issubset(hard_neg_set) is False # Case 9: # dst_negative_field is not provided - hard_sampler = GSHardEdgeDstNegative( + hard_sampler = GSHardEdgeDstNegativeSampler( num_negs, {}, sampler, 2) check_none_hard_negs(hard_sampler, etype2, hard2) # Case 10: # num_hard_negs is not provided - hard_sampler = GSHardEdgeDstNegative( + hard_sampler = GSHardEdgeDstNegativeSampler( num_negs, "hard_negative", sampler, {}) check_none_hard_negs(hard_sampler, etype2, hard2) def test_hard_edge_dst_negative_sample_gen_neg_pairs_complex_case(): - # test GSHardEdgeDstNegative.gen_neg_pairs with slow track when not all edges have enough predefined negatives + # test GSHardEdgeDstNegativeSampler.gen_neg_pairs with slow track when not all edges have enough predefined negatives num_nodes = 1000 - # test GSHardEdgeDstNegative._generate when all some pos edges do not have enough hard negatives defined + # test GSHardEdgeDstNegativeSampler._generate when all some pos edges do not have enough hard negatives defined num_negs = 10 etype0, etype1, etype2, hard0, hard1, hard2, src, dst, g = _create_hard_neg_graph(num_nodes, num_negs) @@ -1817,7 +1817,7 @@ def check_hard_negs(pos_neg_tuple, etype, hard_neg_data, rand_neg_dst.issubset(hard_neg_set) is False sampler = GlobalUniform(num_negs) - hard_sampler = GSHardEdgeDstNegative(num_negs, "hard_negative", sampler) + hard_sampler = GSHardEdgeDstNegativeSampler(num_negs, "hard_negative", sampler) pos_neg_tuple = hard_sampler.gen_neg_pairs(g, pos_pairs) # Case 1: @@ -1851,9 +1851,9 @@ def check_hard_negs(pos_neg_tuple, etype, hard_neg_data, def test_hard_edge_dst_negative_sample_gen_neg_pairs(): - # test GSHardEdgeDstNegative.gen_neg_pairs with fast track when all edges have enough predefined negatives + # test GSHardEdgeDstNegativeSampler.gen_neg_pairs with fast track when all edges have 
enough predefined negatives num_nodes = 1000 - # test GSHardEdgeDstNegative._generate when all some pos edges do not have enough hard negatives defined + # test GSHardEdgeDstNegativeSampler._generate when all some pos edges do not have enough hard negatives defined num_negs = 10 etype0, etype1, etype2, hard0, hard1, hard2, src, dst, g = _create_hard_neg_graph(num_nodes, num_negs) @@ -1884,7 +1884,7 @@ def check_hard_negs(pos_neg_tuple, etype, hard_neg_data, num_hard_neg): rand_neg_dst.issubset(hard_neg_set) is False sampler = GlobalUniform(num_negs) - hard_sampler = GSHardEdgeDstNegative(num_negs, "hard_negative", sampler) + hard_sampler = GSHardEdgeDstNegativeSampler(num_negs, "hard_negative", sampler) pos_neg_tuple = hard_sampler.gen_neg_pairs(g, pos_pairs) # Case 1: @@ -1913,7 +1913,7 @@ def check_hard_negs(pos_neg_tuple, etype, hard_neg_data, num_hard_neg): # 1. all negatives are predefined negatives check_hard_negs(pos_neg_tuple, etype2, hard2, num_negs) - hard_sampler = GSHardEdgeDstNegative(num_negs, + hard_sampler = GSHardEdgeDstNegativeSampler(num_negs, {etype0: "hard_negative", etype1: "hard_negative", etype2: "hard_negative"}, @@ -1961,7 +1961,7 @@ def check_none_hard_negs(pos_neg_tuple, etype, hard_neg_data): # Case 9: # dst_negative_field is not provided - hard_sampler = GSHardEdgeDstNegative( + hard_sampler = GSHardEdgeDstNegativeSampler( num_negs, {}, sampler) pos_neg_tuple = hard_sampler.gen_neg_pairs(g, pos_pairs) check_none_hard_negs(pos_neg_tuple, etype2, hard2) From f9b87403d21a074cf589b86f2c64b1e4ec97a248 Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Tue, 19 Dec 2023 23:42:06 -0800 Subject: [PATCH 06/17] update --- python/graphstorm/dataloading/sampler.py | 47 ++--- tests/unit-tests/test_dataloading.py | 245 ++--------------------- 2 files changed, 33 insertions(+), 259 deletions(-) diff --git a/python/graphstorm/dataloading/sampler.py b/python/graphstorm/dataloading/sampler.py index 01858caf97..bd0cfa0c29 100644 --- a/python/graphstorm/dataloading/sampler.py +++ b/python/graphstorm/dataloading/sampler.py @@ -224,40 +224,19 @@ def _gen_neg_pair(pos_pair, canonical_etype): src, _, pos_dst, neg_dst = random_neg_pairs[canonical_etype] return (src, None, pos_dst, neg_dst) - hard_negatives = g.edges[canonical_etype].data[dst_negative_field][eids] - # It is possible that different edges may have different number of - # pre-defined negatives. For pre-defined negatives, the corresponding - # value in `hard_negatives` will be integers representing the node ids. - # For others, they will be -1s meaning there are missing fixed negatives. 
- if th.sum(hard_negatives == -1) == 0: - # Fast track, there is no -1 in hard_negatives - num_hard_neg = hard_negatives.shape[1] - if self._k < num_hard_neg: - hard_negatives = hard_negatives[:,:self._k] - return (src, None, pos_dst, hard_negatives) - else: - # random negative are needed - random_neg_pairs = \ - self._negative_sampler.gen_neg_pairs(g, - {canonical_etype:pos_pair}) - src, _, pos_dst, neg_dst = random_neg_pairs[canonical_etype] - neg_dst[:,:num_hard_neg] = hard_negatives - return (src, None, pos_dst, neg_dst) - else: - # slow track, we need to handle cases when there are -1s - hard_negatives, _ = th.sort(hard_negatives, dim=1, descending=True) - - random_neg_pairs = \ - self._negative_sampler.gen_neg_pairs(g, {canonical_etype:pos_pair}) - src, _, pos_dst, neg_dst = random_neg_pairs[canonical_etype] - for i in range(len(eids)): - hard_negative = hard_negatives[i] - # ignore -1s - hard_negative = hard_negative[hard_negative > -1] - num_hard_neg = hard_negative.shape[0] - neg_dst[i][:num_hard_neg if num_hard_neg < self._k else self._k] = \ - hard_negative[:num_hard_neg if num_hard_neg < self._k else self._k] - return (src, _, pos_dst, neg_dst) + fixed_negatives = g.edges[canonical_etype].data[dst_negative_field][eids] + + # Users may use HardEdgeDstNegativeTransform + # to prepare the fixed negatives. + assert th.sum(fixed_negatives == -1) == 0, \ + "When using fixed negative destination nodes to construct testing edges," \ + "it is required that for each positive edge there are enough negative " \ + f"destination nodes. Please check the {dst_negative_field} feature " \ + f"of edge type {canonical_etype}" + + num_fixed_neg = fixed_negatives.shape[1] + logging.debug("The number of fixed negative is %d", num_fixed_neg) + return (src, None, pos_dst, fixed_negatives) if isinstance(pos_pairs, Mapping): pos_neg_tuple = {} diff --git a/tests/unit-tests/test_dataloading.py b/tests/unit-tests/test_dataloading.py index 025309e0aa..8c7194b37f 100644 --- a/tests/unit-tests/test_dataloading.py +++ b/tests/unit-tests/test_dataloading.py @@ -58,7 +58,8 @@ from graphstorm.dataloading.sampler import InbatchJointUniform from graphstorm.dataloading.sampler import GlobalUniform -from graphstorm.dataloading.sampler import GSHardEdgeDstNegativeSampler +from graphstorm.dataloading.sampler import (GSHardEdgeDstNegativeSampler, + GSFixedEdgeDstNegativeSampler) from graphstorm.dataloading.dataset import (prepare_batch_input, prepare_batch_edge_input) @@ -1719,252 +1720,47 @@ def check_none_hard_negs(hard_neg_sampler, target_etype, hard_neg_data): num_negs, "hard_negative", sampler, {}) check_none_hard_negs(hard_sampler, etype2, hard2) -def test_hard_edge_dst_negative_sample_gen_neg_pairs_complex_case(): - # test GSHardEdgeDstNegativeSampler.gen_neg_pairs with slow track when not all edges have enough predefined negatives - num_nodes = 1000 - # test GSHardEdgeDstNegativeSampler._generate when all some pos edges do not have enough hard negatives defined - num_negs = 10 - etype0, etype1, etype2, hard0, hard1, hard2, src, dst, g = _create_hard_neg_graph(num_nodes, num_negs) - - # not enough predefined hard negatives - # for hard0[0] and hard0[1] - hard0[0] = th.randperm(num_nodes)[:4] - hard0[0][-1] = -1 - hard0[0][-2] = -1 - hard0[1][-1] = -1 - - # not enough predefined hard negatives - # for hard0[0] and hard0[1] - hard1[0] = th.randperm(num_nodes)[:num_negs] - hard1[1] = th.randperm(num_nodes)[:num_negs] - hard1[0][-1] = -1 - hard1[0][-2] = -1 - hard1[1][-1] = -1 - - # not enough predefined hard 
negatives - # for hard0[0] and hard0[1] - hard2[0] = th.randperm(num_nodes)[:num_negs*2] - hard2[1] = th.randperm(num_nodes)[:num_negs*2] - hard2[0][-1] = -1 - hard2[0][-2] = -1 - hard2[1][-1] = -1 - - num_edges = 10 - pos_pairs = {etype0: (th.arange(10), th.arange(10)), - etype1: (th.arange(10), th.arange(10)), - etype2: (th.arange(10), th.arange(10))} - - def test_missing_hard_negs(neg_dst, hard_neg_data, num_hard_neg): - # hardx[0][-1] and hardx[0][-2] is -1, - # which means hardx[0] does not enough predefined negatives - # Random sample will be applied to -1s. - hard_neg_dst = neg_dst[0][:num_hard_neg] - hard_neg_rand_0 = hard_neg_dst[-1] - hard_neg_rand_1 = hard_neg_dst[-2] - hard_neg_dst = set(hard_neg_dst[:-2].tolist()) - rand_neg_dst = neg_dst[0][num_hard_neg:] - rand_neg_dst = set(rand_neg_dst.tolist()) - hard_neg_set = set(hard_neg_data[0].tolist()) - assert hard_neg_dst.issubset(hard_neg_set) - assert len(rand_neg_dst) == 0 or \ - rand_neg_dst.issubset(hard_neg_set) is False - - rand_0_check = hard_neg_rand_0 not in hard_neg_set - rand_1_check = hard_neg_rand_1 not in hard_neg_set - - # hardx[1][-1] is -1, - # which means hardx[0] does not enough predefined negatives - # Random sample will be applied to -1s. - hard_neg_dst = neg_dst[1][:num_hard_neg] - hard_neg_rand_2 = hard_neg_dst[-1] - hard_neg_dst = set(hard_neg_dst[:-1].tolist()) - rand_neg_dst = neg_dst[1][num_hard_neg:] - rand_neg_dst = set(rand_neg_dst.tolist()) - hard_neg_set = set(hard_neg_data[1].tolist()) - assert hard_neg_dst.issubset(hard_neg_set) - assert len(rand_neg_dst) == 0 or \ - rand_neg_dst.issubset(hard_neg_set) is False - - rand_2_check = hard_neg_rand_2 not in hard_neg_set - # The chance is very to to have rand_0_check, - # rand_1_check and rand_2_check be true at the same time - # The change is (4/1000)^3 - assert rand_0_check or rand_1_check or rand_2_check - - def check_hard_negs(pos_neg_tuple, etype, hard_neg_data, - num_hard_neg, check_missing_hard_neg): - neg_src, _, pos_dst, neg_dst = pos_neg_tuple[etype] - - assert len(neg_src) == num_edges - assert len(pos_dst) == num_edges - assert neg_dst.shape[0] == num_edges - assert neg_dst.shape[1] == num_negs - assert_equal(src[:10].numpy(), neg_src.numpy()) - assert_equal(dst[:10].numpy(), pos_dst.numpy()) - - if check_missing_hard_neg: - test_missing_hard_negs(neg_dst, hard_neg_data, num_hard_neg) - - start = 2 if check_missing_hard_neg else 0 - for i in range(start, num_edges): - hard_neg_dst = neg_dst[i][:num_hard_neg] - hard_neg_dst = set(hard_neg_dst.tolist()) - rand_neg_dst = neg_dst[i][num_hard_neg:] - rand_neg_dst = set(rand_neg_dst.tolist()) - hard_neg_set = set(hard_neg_data[i].tolist()) - assert hard_neg_dst.issubset(hard_neg_set) - assert len(rand_neg_dst) == 0 or \ - rand_neg_dst.issubset(hard_neg_set) is False - - sampler = GlobalUniform(num_negs) - hard_sampler = GSHardEdgeDstNegativeSampler(num_negs, "hard_negative", sampler) - pos_neg_tuple = hard_sampler.gen_neg_pairs(g, pos_pairs) - - # Case 1: - # 1. hard_negative field is string - # 2. The is not enough predefined negative for gen_neg_pairs - # 3. fast track - # 4. slow track (-1 exists in hard neg feature) - # - # expected behavior: - # 1. Only 4 hard negatives are returned - # 2. Others will be random negatives - check_hard_negs(pos_neg_tuple, etype0, hard0, hard0.shape[1], check_missing_hard_neg=True) - # Case 2: - # 1. hard_negative field is string - # 2. num_negs == total number of predefined negatives - # 3. fast track - # 4. 
slow track (-1 exists in hard neg feature) - # - # expected behavior: - # 1. all negatives are predefined negatives - check_hard_negs(pos_neg_tuple, etype1, hard1, hard1.shape[1], check_missing_hard_neg=True) - # Case 3: - # 1. hard_negative field is string - # 2. num_negs < total number of predefined negatives - # 3. fast track - # 4. slow track (-1 exists in hard neg feature) - # - # expected behavior: - # 1. all negatives are predefined negatives - check_hard_negs(pos_neg_tuple, etype2, hard2, num_negs, check_missing_hard_neg=False) - - -def test_hard_edge_dst_negative_sample_gen_neg_pairs(): +def test_edge_fixed_dst_negative_sample_gen_neg_pairs(): # test GSHardEdgeDstNegativeSampler.gen_neg_pairs with fast track when all edges have enough predefined negatives num_nodes = 1000 # test GSHardEdgeDstNegativeSampler._generate when all some pos edges do not have enough hard negatives defined num_negs = 10 - etype0, etype1, etype2, hard0, hard1, hard2, src, dst, g = _create_hard_neg_graph(num_nodes, num_negs) + etype0, etype1, etype2, hard0, _, _, src, dst, g = _create_hard_neg_graph(num_nodes, num_negs) num_edges = 10 pos_pairs = {etype0: (th.arange(10), th.arange(10)), etype1: (th.arange(10), th.arange(10)), etype2: (th.arange(10), th.arange(10))} - def check_hard_negs(pos_neg_tuple, etype, hard_neg_data, num_hard_neg): + def check_fixed_negs(pos_neg_tuple, etype, hard_neg_data): neg_src, _, pos_dst, neg_dst = pos_neg_tuple[etype] assert len(neg_src) == num_edges assert len(pos_dst) == num_edges assert neg_dst.shape[0] == num_edges - assert neg_dst.shape[1] == num_negs assert_equal(src[:10].numpy(), neg_src.numpy()) assert_equal(dst[:10].numpy(), pos_dst.numpy()) - # check hard negative - for i in range(num_edges): - hard_neg_dst = neg_dst[i][:num_hard_neg] - hard_neg_dst = set(hard_neg_dst.tolist()) - rand_neg_dst = neg_dst[i][num_hard_neg:] - rand_neg_dst = set(rand_neg_dst.tolist()) - hard_neg_set = set(hard_neg_data[i].tolist()) - assert hard_neg_dst.issubset(hard_neg_set) - assert len(rand_neg_dst) == 0 or \ - rand_neg_dst.issubset(hard_neg_set) is False + assert_equal(hard_neg_data[:10].numpy(), neg_dst.numpy()) - sampler = GlobalUniform(num_negs) - hard_sampler = GSHardEdgeDstNegativeSampler(num_negs, "hard_negative", sampler) + hard_sampler = GSFixedEdgeDstNegativeSampler("hard_negative") pos_neg_tuple = hard_sampler.gen_neg_pairs(g, pos_pairs) + check_fixed_negs(pos_neg_tuple, etype0, hard0) - # Case 1: - # 1. hard_negative field is string - # 2. The is not enough predefined negative for gen_neg_pairs - # 3. fast track - # - # expected behavior: - # 1. Only 4 hard negatives are returned - # 2. Others will be random negatives - check_hard_negs(pos_neg_tuple, etype0, hard0, hard0.shape[1]) - # Case 2: - # 1. hard_negative field is string - # 2. num_negs == total number of predefined negatives - # 3. fast track - # - # expected behavior: - # 1. all negatives are predefined negatives - check_hard_negs(pos_neg_tuple, etype1, hard1, hard1.shape[1]) - # Case 3: - # 1. hard_negative field is string - # 2. num_negs < total number of predefined negatives - # 3. fast track - # - # expected behavior: - # 1. all negatives are predefined negatives - check_hard_negs(pos_neg_tuple, etype2, hard2, num_negs) - - hard_sampler = GSHardEdgeDstNegativeSampler(num_negs, - {etype0: "hard_negative", + hard_sampler = GSFixedEdgeDstNegativeSampler({etype0: "hard_negative", etype1: "hard_negative", - etype2: "hard_negative"}, - sampler) - # Case 4: - # 1. hard_negative field is dict - # 2. 
The is not enough predefined negative for gen_neg_pairs - # 3. fast track - # - # expected behavior: - # 1. Only 4 hard negatives are returned - # 2. Others will be random negatives - check_hard_negs(pos_neg_tuple, etype0, hard0, hard0.shape[1]) - # Case 5: - # 1. hard_negative field is dict - # 2. num_negs == total number of predefined negatives - # 3. fast track - # - # expected behavior: - # 1. all negatives are predefined negatives - check_hard_negs(pos_neg_tuple, etype1, hard1, hard1.shape[1]) - # Case 6: - # 1. hard_negative field is dict - # 2. num_negs < total number of predefined negatives - # 3. fast track - # - # expected behavior: - # 1. all negatives are predefined negatives - check_hard_negs(pos_neg_tuple, etype2, hard2, num_negs) + etype2: "hard_negative"}) + check_fixed_negs(pos_neg_tuple, etype0, hard0) - def check_none_hard_negs(pos_neg_tuple, etype, hard_neg_data): - neg_src, _, pos_dst, neg_dst = pos_neg_tuple[etype] - - assert len(neg_src) == num_edges - assert len(pos_dst) == num_edges - assert neg_dst.shape[0] == num_edges - assert neg_dst.shape[1] == num_negs - assert_equal(src[:10].numpy(), neg_src.numpy()) - assert_equal(dst[:10].numpy(), pos_dst.numpy()) - - for i in range(num_edges): - hard_neg_dst = set(neg_dst[i].tolist()) - hard_neg_set = set(hard_neg_data[i].tolist()) - assert hard_neg_dst.issubset(hard_neg_set) is False + # each positive edge should have enough fixed negatives + hard0[0][-1] = -1 + fail = False + try: + pos_neg_tuple = hard_sampler.gen_neg_pairs(g, pos_pairs) + except: + fail = True + assert fail - # Case 9: - # dst_negative_field is not provided - hard_sampler = GSHardEdgeDstNegativeSampler( - num_negs, {}, sampler) - pos_neg_tuple = hard_sampler.gen_neg_pairs(g, pos_pairs) - check_none_hard_negs(pos_neg_tuple, etype2, hard2) @pytest.mark.parametrize("num_pos", [2, 10]) @pytest.mark.parametrize("num_neg", [5, 20]) @@ -1992,8 +1788,7 @@ def test_inbatch_joint_neg_sampler(num_pos, num_neg): if __name__ == '__main__': - test_hard_edge_dst_negative_sample_gen_neg_pairs_complex_case() - test_hard_edge_dst_negative_sample_gen_neg_pairs() + test_edge_fixed_dst_negative_sample_gen_neg_pairs() test_hard_edge_dst_negative_sample_generate_complex_case() test_hard_edge_dst_negative_sample_generate() test_inbatch_joint_neg_sampler(10, 20) From 9c0bb943b2b65113b7c3e6be84c98de1f966e3e9 Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Tue, 19 Dec 2023 23:51:49 -0800 Subject: [PATCH 07/17] update --- python/graphstorm/dataloading/dataloading.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/python/graphstorm/dataloading/dataloading.py b/python/graphstorm/dataloading/dataloading.py index 53d4e4ec15..afa1650d28 100644 --- a/python/graphstorm/dataloading/dataloading.py +++ b/python/graphstorm/dataloading/dataloading.py @@ -34,7 +34,8 @@ InbatchJointUniform, FastMultiLayerNeighborSampler, DistributedFileSampler, - GSHardEdgeDstNegativeSampler) + GSHardEdgeDstNegativeSampler, + GSFixedEdgeDstNegativeSampler) from .utils import trim_data, modify_fanout_for_target_etype from .dataset import GSDistillData @@ -369,6 +370,7 @@ def fanout(self): BUILTIN_FAST_LP_JOINT_NEG_SAMPLER = 'fast_joint' BUILTIN_FAST_LP_LOCALUNIFORM_NEG_SAMPLER = 'fast_localuniform' BUILTIN_FAST_LP_LOCALJOINT_NEG_SAMPLER = 'fast_localjoint' +BUILTIN_LP_FIXED_NEG_SAMPLER = 'fixed' class GSgnnLinkPredictionDataLoaderBase(): """ The base class of link prediction dataloader. 
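The fixed-negative evaluation path wired in here relies on a simple data contract: every test edge must carry a complete row of valid negative destination node ids in its edge feature, with no -1 padding. A minimal sketch of that contract in plain PyTorch, where the feature name "neg" and all sizes are illustrative assumptions:

    import torch as th

    num_edges, num_fixed_negs, num_nodes = 10, 10, 1000
    # Per-edge predefined negative destination nodes, as would be stored in
    # g.edges[canonical_etype].data["neg"]; every row must be fully populated.
    fixed_negs = th.randint(0, num_nodes, (num_edges, num_fixed_negs))

    # GSFixedEdgeDstNegativeSampler.gen_neg_pairs() rejects -1 entries, which
    # mark missing negatives, so each positive edge needs a complete set.
    assert th.sum(fixed_negs == -1) == 0

    # For each edge type the sampler returns (neg_src, None, pos_dst, neg_dst),
    # where neg_dst is exactly this per-edge matrix (see check_fixed_negs above).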
@@ -1012,11 +1014,13 @@ def _reinit_dataset(self): def _prepare_negative_sampler(self, num_negative_edges): # the default negative sampler is uniform sampler self._neg_sample_type = BUILTIN_LP_UNIFORM_NEG_SAMPLER - negative_sampler = GlobalUniform(num_negative_edges) + if self._fixed_edge_dst_negative_field: - negative_sampler = GSHardEdgeDstNegativeSampler(num_negative_edges, - self._fixed_edge_dst_negative_field, - negative_sampler) + negative_sampler = GSFixedEdgeDstNegativeSampler(self._fixed_edge_dst_negative_field) + self._neg_sample_type = BUILTIN_LP_FIXED_NEG_SAMPLER + else: + negative_sampler = GlobalUniform(num_negative_edges) + self._neg_sample_type = BUILTIN_LP_UNIFORM_NEG_SAMPLER return negative_sampler def __iter__(self): From 8420e56165233aed6829a50ba76a1c9703261880 Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Wed, 20 Dec 2023 11:38:49 -0800 Subject: [PATCH 08/17] Update --- python/graphstorm/config/argument.py | 71 +++++++++--------------- python/graphstorm/dataloading/sampler.py | 23 +++++++- 2 files changed, 47 insertions(+), 47 deletions(-) diff --git a/python/graphstorm/config/argument.py b/python/graphstorm/config/argument.py index a017b915e8..da4cac5fab 100644 --- a/python/graphstorm/config/argument.py +++ b/python/graphstorm/config/argument.py @@ -1874,6 +1874,30 @@ def lp_edge_weight_for_loss(self): return None + def _get_predefined_negatives_per_etype(negatives): + if len(negatives) == 1 and \ + ":" not in negatives[0]: + # global feat_name + return int(negatives[0]) + + # per edge type feature + negative_dict = {} + for negative in negatives: + negative_info = negative.split(":") + assert len(negative_info) == 2, \ + "negative dstnode information must be provided in format of " \ + f"src,relation,dst:feature_name, but get {negative}" + + etype = tuple(negative_info[0].split(",")) + assert len(etype) == 3, \ + f"Edge type must in format of (src,relation,dst), but get {etype}" + assert etype not in negative_dict, \ + f"You already specify the fixed negative of {etype} " \ + f"as {negative_dict[etype]}" + + negative_dict[etype] = negative_info[1] + return negative_dict + @property def train_etypes_negative_dstnode(self): """ The list of canonical etypes that have hard negative edges @@ -1895,29 +1919,7 @@ def train_etypes_negative_dstnode(self): assert self.task_type == BUILTIN_TASK_LINK_PREDICTION, \ "Hard negative only works with link prediction" hard_negatives = self._train_etypes_negative_dstnode - if len(hard_negatives) == 1 and \ - ":" not in hard_negatives[0]: - # global feat_name - return hard_negatives[0] - - # per edge type feature - hard_negative_dict = {} - for hard_negative in hard_negatives: - negative_info = hard_negative.split(":") - assert len(negative_info) == 2, \ - "negative dstnode information must be provided in format of " \ - f"src,relation,dst:feature_name, but get {hard_negative}" - - etype = tuple(negative_info[0].split(",")) - assert len(etype) == 3, \ - f"Edge type must in format of (src,relation,dst), but get {etype}" - - assert etype not in hard_negative_dict, \ - f"You already specify the fixed negative of {etype} " \ - f"as {hard_negative_dict[etype]}" - - hard_negative_dict[etype] = negative_info[1] - return hard_negative_dict + return self._get_predefined_negatives_per_etype(hard_negatives) # By default fixed negative is not used return None @@ -1982,28 +1984,7 @@ def eval_etypes_negative_dstnode(self): assert self.task_type == BUILTIN_TASK_LINK_PREDICTION, \ "Fixed negative only works with link prediction" fixed_negatives = 
self._eval_etypes_negative_dstnode - if len(fixed_negatives) == 1 and \ - ":" not in fixed_negatives[0]: - # global feat_name - return fixed_negatives[0] - - # per edge type feature - fixed_negative_dict = {} - for fixed_negative in fixed_negatives: - negative_info = fixed_negative.split(":") - assert len(negative_info) == 2, \ - "negative dstnode information must be provided in format of " \ - f"src,relation,dst:feature_name, but get {fixed_negative}" - - etype = tuple(negative_info[0].split(",")) - assert len(etype) == 3, \ - f"Edge type must in format of (src,relation,dst), but get {etype}" - assert etype not in fixed_negative_dict, \ - f"You already specify the fixed negative of {etype} " \ - f"as {fixed_negative_dict[etype]}" - - fixed_negative_dict[etype] = negative_info[1] - return fixed_negative_dict + return self._get_predefined_negatives_per_etype(fixed_negatives) # By default fixed negative is not used return None diff --git a/python/graphstorm/dataloading/sampler.py b/python/graphstorm/dataloading/sampler.py index bd0cfa0c29..836f563126 100644 --- a/python/graphstorm/dataloading/sampler.py +++ b/python/graphstorm/dataloading/sampler.py @@ -24,7 +24,8 @@ from dgl import backend as F from dgl import EID, NID from dgl.distributed import node_split -from dgl.dataloading.negative_sampler import Uniform +from dgl.dataloading.negative_sampler import (Uniform, + _BaseNegativeSampler) from dgl.dataloading import NeighborSampler from dgl.transforms import to_block @@ -72,7 +73,7 @@ def _generate(self, g, eids, canonical_etype): dst = F.randint(shape, dtype, ctx, 0, self._local_neg_nids[vtype].shape[0]) return src, self._local_neg_nids[vtype][dst] -class GSHardEdgeDstNegativeSampler(object): +class GSHardEdgeDstNegativeSampler(_BaseNegativeSampler): """ GraphStorm negative sampler that chooses negative destination nodes from a fixed set to create negative edges. @@ -97,6 +98,10 @@ def __init__(self, k, dst_negative_field, negative_sampler, num_hard_negs=None): self._num_hard_negs = num_hard_negs def _generate(self, g, eids, canonical_etype): + """ _generate() is called by DGL BaseNegativeSampler to generate negative pairs. + + See https://github.com/dmlc/dgl/blob/1.1.x/python/dgl/dataloading/negative_sampler.py#L7 For more detials + """ if isinstance(self._dst_negative_field, str): dst_negative_field = self._dst_negative_field elif canonical_etype in self._dst_negative_field: @@ -174,10 +179,21 @@ def _generate(self, g, eids, canonical_etype): hard_negative[:num_hard_neg if num_hard_neg < self._k else self._k] return src, neg + def gen_neg_pairs(self, g, pos_pairs): + """ TODO: Do not support generating negative pairs for evaluation in the same way as + generating negative pairs for training now. + Please use GSFixedEdgeDstNegativeSampler instead. + """ + raise RuntimeError("Sampling negative edges for evaluation purpose" + "is not supported for GSHardEdgeDstNegativeSampler. " + "Please use GSFixedEdgeDstNegativeSampler instead.") + class GSFixedEdgeDstNegativeSampler(object): """ GraphStorm negative sampler that uses fixed negative destination nodes to create negative edges. + GSFixedEdgeDstNegativeSampler only works with test dataloader. + Parameters ---------- dst_negative_field: str or dict of str @@ -192,6 +208,9 @@ def gen_neg_pairs(self, g, pos_pairs): """ Returns negative examples associated with positive examples. It only return dst negatives. + This function is called by GSgnnLinkPredictionTestDataLoader._next_data() + to generate testing edges. 
+ Parameters ---------- g : DGLGraph From 776a86221c4ef6565ab1c52c041ec847f220b62e Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Wed, 20 Dec 2023 11:44:04 -0800 Subject: [PATCH 09/17] Update --- python/graphstorm/config/argument.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/graphstorm/config/argument.py b/python/graphstorm/config/argument.py index da4cac5fab..e6ebc95f79 100644 --- a/python/graphstorm/config/argument.py +++ b/python/graphstorm/config/argument.py @@ -1953,7 +1953,12 @@ def num_train_hard_negatives(self): num_hard_negative_dict = {} for num_negative in num_negatives: negative_info = num_negative.split(":") + assert len(negative_info) == 2, \ + "Number of train hard negative information must be provided in format of " \ + f"src,relation,dst:10, but get {num_negative}" etype = tuple(negative_info[0].split(",")) + assert len(etype) == 3, \ + f"Edge type must in format of (src,relation,dst), but get {etype}" assert etype not in num_hard_negative_dict, \ f"You already specify the fixed negative of {etype} " \ f"as {num_hard_negative_dict[etype]}" From fc35ba15faa29942d35f5ae677ca5b63cae9f739 Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Wed, 20 Dec 2023 12:41:48 -0800 Subject: [PATCH 10/17] Update --- python/graphstorm/dataloading/__init__.py | 8 +- python/graphstorm/dataloading/dataloading.py | 80 ++++++++++++++----- python/graphstorm/dataloading/sampler.py | 2 +- python/graphstorm/run/gsgnn_lp/gsgnn_lm_lp.py | 29 +++++-- python/graphstorm/run/gsgnn_lp/gsgnn_lp.py | 39 ++++++--- .../graphstorm/run/gsgnn_lp/lp_infer_gnn.py | 23 ++++-- python/graphstorm/run/gsgnn_lp/lp_infer_lm.py | 20 +++-- tests/unit-tests/test_dataloading.py | 65 ++++++++++++++- 8 files changed, 208 insertions(+), 58 deletions(-) diff --git a/python/graphstorm/dataloading/__init__.py b/python/graphstorm/dataloading/__init__.py index 1aed57bb12..20abce548d 100644 --- a/python/graphstorm/dataloading/__init__.py +++ b/python/graphstorm/dataloading/__init__.py @@ -24,8 +24,9 @@ from .dataloading import GSgnnAllEtypeLinkPredictionDataLoader from .dataloading import GSgnnEdgeDataLoader from .dataloading import GSgnnNodeDataLoader, GSgnnNodeSemiSupDataLoader -from .dataloading import GSgnnLinkPredictionTestDataLoader -from .dataloading import GSgnnLinkPredictionJointTestDataLoader +from .dataloading import (GSgnnLinkPredictionTestDataLoader, + GSgnnLinkPredictionJointTestDataLoader, + GSgnnLinkPredictionPredefinedTestDataLoader) from .dataloading import (FastGSgnnLinkPredictionDataLoader, FastGSgnnLPLocalJointNegDataLoader, FastGSgnnLPJointNegDataLoader, @@ -43,7 +44,8 @@ BUILTIN_LP_JOINT_NEG_SAMPLER, BUILTIN_LP_INBATCH_JOINT_NEG_SAMPLER, BUILTIN_LP_LOCALUNIFORM_NEG_SAMPLER, - BUILTIN_LP_LOCALJOINT_NEG_SAMPLER) + BUILTIN_LP_LOCALJOINT_NEG_SAMPLER, + BUILTIN_LP_FIXED_NEG_SAMPLER) from .dataloading import BUILTIN_LP_ALL_ETYPE_UNIFORM_NEG_SAMPLER from .dataloading import BUILTIN_LP_ALL_ETYPE_JOINT_NEG_SAMPLER from .dataloading import (BUILTIN_FAST_LP_UNIFORM_NEG_SAMPLER, diff --git a/python/graphstorm/dataloading/dataloading.py b/python/graphstorm/dataloading/dataloading.py index afa1650d28..517ec236aa 100644 --- a/python/graphstorm/dataloading/dataloading.py +++ b/python/graphstorm/dataloading/dataloading.py @@ -488,6 +488,10 @@ class GSgnnLinkPredictionDataLoader(GSgnnLinkPredictionDataLoaderBase): The node types that requires to construct node features. construct_feat_fanout : int The fanout required to construct node features. 
+ edge_dst_negative_field: str or dict of str + The feature field(s) that store the hard negative edges for each edge type. + num_hard_negs: int or dict of int + The number of hard negatives per positive edge for each edge type Examples ------------ @@ -510,7 +514,9 @@ class GSgnnLinkPredictionDataLoader(GSgnnLinkPredictionDataLoaderBase): def __init__(self, dataset, target_idx, fanout, batch_size, num_negative_edges, device='cpu', train_task=True, reverse_edge_types_map=None, exclude_training_targets=False, edge_mask_for_gnn_embeddings='train_mask', - construct_feat_ntype=None, construct_feat_fanout=5): + construct_feat_ntype=None, construct_feat_fanout=5, + edge_dst_negative_field=None, + num_hard_negs=None): super().__init__(dataset, target_idx, fanout) self._device = device for etype in target_idx: @@ -524,7 +530,9 @@ def __init__(self, dataset, target_idx, fanout, batch_size, num_negative_edges, reverse_edge_types_map=reverse_edge_types_map, edge_mask_for_gnn_embeddings=edge_mask_for_gnn_embeddings, construct_feat_ntype=construct_feat_ntype, - construct_feat_fanout=construct_feat_fanout) + construct_feat_fanout=construct_feat_fanout, + edge_dst_negative_field=edge_dst_negative_field, + num_hard_negs=num_hard_negs) def _prepare_negative_sampler(self, num_negative_edges): # the default negative sampler is uniform sampler @@ -535,7 +543,8 @@ def _prepare_dataloader(self, dataset, target_idxs, fanout, num_negative_edges, batch_size, device, train_task=True, exclude_training_targets=False, reverse_edge_types_map=None, edge_mask_for_gnn_embeddings=None, construct_feat_ntype=None, - construct_feat_fanout=5): + construct_feat_fanout=5, edge_dst_negative_field=None, + num_hard_negs=None): g = dataset.g if construct_feat_ntype is None: construct_feat_ntype = [] @@ -556,6 +565,11 @@ def _prepare_dataloader(self, dataset, target_idxs, fanout, sampler = MultiLayerNeighborSamplerForReconstruct(sampler, dataset, construct_feat_ntype, construct_feat_fanout) negative_sampler = self._prepare_negative_sampler(num_negative_edges) + if edge_dst_negative_field is not None: + negative_sampler = GSHardEdgeDstNegativeSampler(num_negative_edges, + edge_dst_negative_field, + negative_sampler, + num_hard_negs) # edge loader if train_task: @@ -642,7 +656,8 @@ def _prepare_dataloader(self, dataset, target_idxs, fanout, num_negative_edges, batch_size, device, train_task=True, exclude_training_targets=False, reverse_edge_types_map=None, edge_mask_for_gnn_embeddings=None, construct_feat_ntype=None, - construct_feat_fanout=5): + construct_feat_fanout=5, edge_dst_negative_field=None, + num_hard_negs=None): g = dataset.g if construct_feat_ntype is None: construct_feat_ntype = [] @@ -663,6 +678,11 @@ def _prepare_dataloader(self, dataset, target_idxs, fanout, sampler = MultiLayerNeighborSamplerForReconstruct(sampler, dataset, construct_feat_ntype, construct_feat_fanout) negative_sampler = self._prepare_negative_sampler(num_negative_edges) + if edge_dst_negative_field is not None: + negative_sampler = GSHardEdgeDstNegativeSampler(num_negative_edges, + edge_dst_negative_field, + negative_sampler, + num_hard_negs) # edge loader if train_task: @@ -975,11 +995,9 @@ class GSgnnLinkPredictionTestDataLoader(): When test is huge, using fixed_test_size can save validation and test time. Default: None. - fixed_edge_dst_negative_field: str or list of str - The feature field(s) that store the fixed negative set for each edge. 
""" def __init__(self, dataset, target_idx, batch_size, num_negative_edges, - fanout=None, fixed_test_size=None, fixed_edge_dst_negative_field=None): + fanout=None, fixed_test_size=None): self._data = dataset self._fanout = fanout for etype in target_idx: @@ -996,7 +1014,6 @@ def __init__(self, dataset, target_idx, batch_size, num_negative_edges, "is %d, which is smaller than the expected" "test size %d, force it to %d", etype, len(t_idx), self._fixed_test_size[etype], len(t_idx)) - self._fixed_edge_dst_negative_field = fixed_edge_dst_negative_field self._negative_sampler = self._prepare_negative_sampler(num_negative_edges) self._reinit_dataset() @@ -1014,13 +1031,7 @@ def _reinit_dataset(self): def _prepare_negative_sampler(self, num_negative_edges): # the default negative sampler is uniform sampler self._neg_sample_type = BUILTIN_LP_UNIFORM_NEG_SAMPLER - - if self._fixed_edge_dst_negative_field: - negative_sampler = GSFixedEdgeDstNegativeSampler(self._fixed_edge_dst_negative_field) - self._neg_sample_type = BUILTIN_LP_FIXED_NEG_SAMPLER - else: - negative_sampler = GlobalUniform(num_negative_edges) - self._neg_sample_type = BUILTIN_LP_UNIFORM_NEG_SAMPLER + negative_sampler = GlobalUniform(num_negative_edges) return negative_sampler def __iter__(self): @@ -1069,10 +1080,41 @@ def _prepare_negative_sampler(self, num_negative_edges): # the default negative sampler is uniform sampler negative_sampler = JointUniform(num_negative_edges) self._neg_sample_type = BUILTIN_LP_JOINT_NEG_SAMPLER - if self._fixed_edge_dst_negative_field: - negative_sampler = GSHardEdgeDstNegativeSampler(num_negative_edges, - self._fixed_edge_dst_negative_field, - negative_sampler) + return negative_sampler + +class GSgnnLinkPredictionPredefinedTestDataLoader(GSgnnLinkPredictionTestDataLoader): + """ Link prediction minibatch dataloader for validation and test + with predefined negatives. + + Parameters + ----------- + dataset: GSgnnEdgeData + The GraphStorm edge dataset + target_idx : dict of Tensors + The target edges for prediction + batch_size: int + Batch size + fanout: int + Evaluation fanout for computing node embedding + fixed_test_size: int + Fixed number of test data used in evaluation. + If it is none, use the whole testset. + When test is huge, using fixed_test_size + can save validation and test time. + Default: None. + fixed_edge_dst_negative_field: str or list of str + The feature field(s) that store the fixed negative set for each edge. 
+ """ + def __init__(self, dataset, target_idx, batch_size, fixed_edge_dst_negative_field, + fanout=None, fixed_test_size=None): + self._fixed_edge_dst_negative_field = fixed_edge_dst_negative_field + super().__init__(dataset, target_idx, batch_size, + num_negative_edges=0, # num_negative_edges is not used + fanout=fanout, fixed_test_size=fixed_test_size) + + def _prepare_negative_sampler(self, _): + negative_sampler = GSFixedEdgeDstNegativeSampler(self._fixed_edge_dst_negative_field) + self._neg_sample_type = BUILTIN_LP_FIXED_NEG_SAMPLER return negative_sampler ################ Minibatch DataLoader (Node classification) ####################### diff --git a/python/graphstorm/dataloading/sampler.py b/python/graphstorm/dataloading/sampler.py index 836f563126..fc474d85e9 100644 --- a/python/graphstorm/dataloading/sampler.py +++ b/python/graphstorm/dataloading/sampler.py @@ -179,7 +179,7 @@ def _generate(self, g, eids, canonical_etype): hard_negative[:num_hard_neg if num_hard_neg < self._k else self._k] return src, neg - def gen_neg_pairs(self, g, pos_pairs): + def gen_neg_pairs(self, _): """ TODO: Do not support generating negative pairs for evaluation in the same way as generating negative pairs for training now. Please use GSFixedEdgeDstNegativeSampler instead. diff --git a/python/graphstorm/run/gsgnn_lp/gsgnn_lm_lp.py b/python/graphstorm/run/gsgnn_lp/gsgnn_lm_lp.py index 143fd58e06..de18c5cc60 100644 --- a/python/graphstorm/run/gsgnn_lp/gsgnn_lm_lp.py +++ b/python/graphstorm/run/gsgnn_lp/gsgnn_lm_lp.py @@ -30,8 +30,9 @@ GSgnnLPInBatchJointNegDataLoader) from graphstorm.dataloading import GSgnnAllEtypeLPJointNegDataLoader from graphstorm.dataloading import GSgnnAllEtypeLinkPredictionDataLoader -from graphstorm.dataloading import GSgnnLinkPredictionTestDataLoader -from graphstorm.dataloading import GSgnnLinkPredictionJointTestDataLoader +from graphstorm.dataloading import (GSgnnLinkPredictionTestDataLoader, + GSgnnLinkPredictionJointTestDataLoader, + GSgnnLinkPredictionPredefinedTestDataLoader) from graphstorm.dataloading import (BUILTIN_LP_UNIFORM_NEG_SAMPLER, BUILTIN_LP_JOINT_NEG_SAMPLER, BUILTIN_LP_INBATCH_JOINT_NEG_SAMPLER, @@ -129,10 +130,14 @@ def main(config_args): raise ValueError('Unknown negative sampler') dataloader = dataloader_cls(train_data, train_data.train_idxs, [], config.batch_size, config.num_negative_edges, device, - train_task=True) + train_task=True, + edge_dst_negative_field=config.train_etypes_negative_dstnode, + num_hard_negs=config.num_train_hard_negatives) # TODO(zhengda) let's use full-graph inference for now. 
- if config.eval_negative_sampler == BUILTIN_LP_UNIFORM_NEG_SAMPLER: + if config.eval_etypes_negative_dstnode is not None: + test_dataloader_cls = GSgnnLinkPredictionPredefinedTestDataLoader + elif config.eval_negative_sampler == BUILTIN_LP_UNIFORM_NEG_SAMPLER: test_dataloader_cls = GSgnnLinkPredictionTestDataLoader elif config.eval_negative_sampler == BUILTIN_LP_JOINT_NEG_SAMPLER: test_dataloader_cls = GSgnnLinkPredictionJointTestDataLoader @@ -143,11 +148,19 @@ def main(config_args): val_dataloader = None test_dataloader = None if len(train_data.val_idxs) > 0: - val_dataloader = test_dataloader_cls(train_data, train_data.val_idxs, - config.eval_batch_size, config.num_negative_edges_eval) + if config.eval_etypes_negative_dstnode is not None: + val_dataloader = test_dataloader_cls(train_data, train_data.val_idxs, + config.eval_batch_size, config.eval_etypes_negative_dstnode) + else: + val_dataloader = test_dataloader_cls(train_data, train_data.val_idxs, + config.eval_batch_size, config.num_negative_edges_eval) if len(train_data.test_idxs) > 0: - test_dataloader = test_dataloader_cls(train_data, train_data.test_idxs, - config.eval_batch_size, config.num_negative_edges_eval) + if config.eval_etypes_negative_dstnode is not None: + test_dataloader = test_dataloader_cls(train_data, train_data.test_idxs, + config.eval_batch_size, config.eval_etypes_negative_dstnode) + else: + test_dataloader = test_dataloader_cls(train_data, train_data.test_idxs, + config.eval_batch_size, config.num_negative_edges_eval) # Preparing input layer for training or inference. # The input layer can pre-compute node features in the preparing step if needed. diff --git a/python/graphstorm/run/gsgnn_lp/gsgnn_lp.py b/python/graphstorm/run/gsgnn_lp/gsgnn_lp.py index 92a40dc737..1a843e18fa 100644 --- a/python/graphstorm/run/gsgnn_lp/gsgnn_lp.py +++ b/python/graphstorm/run/gsgnn_lp/gsgnn_lp.py @@ -30,8 +30,9 @@ GSgnnLPInBatchJointNegDataLoader) from graphstorm.dataloading import GSgnnAllEtypeLPJointNegDataLoader from graphstorm.dataloading import GSgnnAllEtypeLinkPredictionDataLoader -from graphstorm.dataloading import GSgnnLinkPredictionTestDataLoader -from graphstorm.dataloading import GSgnnLinkPredictionJointTestDataLoader +from graphstorm.dataloading import (GSgnnLinkPredictionTestDataLoader, + GSgnnLinkPredictionJointTestDataLoader, + GSgnnLinkPredictionPredefinedTestDataLoader) from graphstorm.dataloading import (BUILTIN_LP_UNIFORM_NEG_SAMPLER, BUILTIN_LP_JOINT_NEG_SAMPLER, BUILTIN_LP_INBATCH_JOINT_NEG_SAMPLER, @@ -152,10 +153,14 @@ def main(config_args): reverse_edge_types_map=config.reverse_edge_types_map, exclude_training_targets=config.exclude_training_targets, construct_feat_ntype=config.construct_feat_ntype, - construct_feat_fanout=config.construct_feat_fanout) + construct_feat_fanout=config.construct_feat_fanout, + edge_dst_negative_field=config.train_etypes_negative_dstnode, + num_hard_negs=config.num_train_hard_negatives) # TODO(zhengda) let's use full-graph inference for now. 
- if config.eval_negative_sampler == BUILTIN_LP_UNIFORM_NEG_SAMPLER: + if config.eval_etypes_negative_dstnode is not None: + test_dataloader_cls = GSgnnLinkPredictionPredefinedTestDataLoader + elif config.eval_negative_sampler == BUILTIN_LP_UNIFORM_NEG_SAMPLER: test_dataloader_cls = GSgnnLinkPredictionTestDataLoader elif config.eval_negative_sampler == BUILTIN_LP_JOINT_NEG_SAMPLER: test_dataloader_cls = GSgnnLinkPredictionJointTestDataLoader @@ -166,13 +171,27 @@ def main(config_args): val_dataloader = None test_dataloader = None if len(train_data.val_idxs) > 0: - val_dataloader = test_dataloader_cls(train_data, train_data.val_idxs, - config.eval_batch_size, config.num_negative_edges_eval, config.eval_fanout, - fixed_test_size=config.fixed_test_size) + if config.eval_etypes_negative_dstnode is not None: + val_dataloader = test_dataloader_cls(train_data, train_data.val_idxs, + config.eval_batch_size, + fixed_edge_dst_negative_field=config.eval_etypes_negative_dstnode, + fanout=config.eval_fanout, + fixed_test_size=config.fixed_test_size) + else: + val_dataloader = test_dataloader_cls(train_data, train_data.val_idxs, + config.eval_batch_size, config.num_negative_edges_eval, config.eval_fanout, + fixed_test_size=config.fixed_test_size) if len(train_data.test_idxs) > 0: - test_dataloader = test_dataloader_cls(train_data, train_data.test_idxs, - config.eval_batch_size, config.num_negative_edges_eval, config.eval_fanout, - fixed_test_size=config.fixed_test_size) + if config.eval_etypes_negative_dstnode is not None: + test_dataloader = test_dataloader_cls(train_data, train_data.test_idxs, + config.eval_batch_size, + fixed_edge_dst_negative_field=config.eval_etypes_negative_dstnode, + fanout=config.eval_fanout, + fixed_test_size=config.fixed_test_size) + else: + test_dataloader = test_dataloader_cls(train_data, train_data.test_idxs, + config.eval_batch_size, config.num_negative_edges_eval, config.eval_fanout, + fixed_test_size=config.fixed_test_size) # Preparing input layer for training or inference. # The input layer can pre-compute node features in the preparing step if needed. diff --git a/python/graphstorm/run/gsgnn_lp/lp_infer_gnn.py b/python/graphstorm/run/gsgnn_lp/lp_infer_gnn.py index 50a2e97acc..3d6a854999 100644 --- a/python/graphstorm/run/gsgnn_lp/lp_infer_gnn.py +++ b/python/graphstorm/run/gsgnn_lp/lp_infer_gnn.py @@ -22,8 +22,9 @@ from graphstorm.inference import GSgnnLinkPredictionInferrer from graphstorm.eval import GSgnnMrrLPEvaluator from graphstorm.dataloading import GSgnnEdgeInferData -from graphstorm.dataloading import GSgnnLinkPredictionTestDataLoader -from graphstorm.dataloading import GSgnnLinkPredictionJointTestDataLoader +from graphstorm.dataloading import (GSgnnLinkPredictionTestDataLoader, + GSgnnLinkPredictionJointTestDataLoader, + GSgnnLinkPredictionPredefinedTestDataLoader) from graphstorm.dataloading import BUILTIN_LP_UNIFORM_NEG_SAMPLER from graphstorm.dataloading import BUILTIN_LP_JOINT_NEG_SAMPLER from graphstorm.utils import setup_device, get_lm_ntypes @@ -58,7 +59,9 @@ def main(config_args): tracker = gs.create_builtin_task_tracker(config) infer.setup_task_tracker(tracker) # We only support full-graph inference for now. 
- if config.eval_negative_sampler == BUILTIN_LP_UNIFORM_NEG_SAMPLER: + if config.eval_etypes_negative_dstnode is not None: + test_dataloader_cls = GSgnnLinkPredictionPredefinedTestDataLoader + elif config.eval_negative_sampler == BUILTIN_LP_UNIFORM_NEG_SAMPLER: test_dataloader_cls = GSgnnLinkPredictionTestDataLoader elif config.eval_negative_sampler == BUILTIN_LP_JOINT_NEG_SAMPLER: test_dataloader_cls = GSgnnLinkPredictionJointTestDataLoader @@ -67,10 +70,16 @@ def main(config_args): 'Supported test negative samplers include ' f'[{BUILTIN_LP_UNIFORM_NEG_SAMPLER}, {BUILTIN_LP_JOINT_NEG_SAMPLER}]') - dataloader = test_dataloader_cls(infer_data, infer_data.test_idxs, - batch_size=config.eval_batch_size, - num_negative_edges=config.num_negative_edges_eval, - fanout=config.eval_fanout) + if config.eval_etypes_negative_dstnode is not None: + dataloader = test_dataloader_cls(infer_data, infer_data.test_idxs, + batch_size=config.eval_batch_size, + fixed_edge_dst_negative_field=config.eval_etypes_negative_dstnode, + fanout=config.eval_fanout) + else: + dataloader = test_dataloader_cls(infer_data, infer_data.test_idxs, + batch_size=config.eval_batch_size, + num_negative_edges=config.num_negative_edges_eval, + fanout=config.eval_fanout) infer.infer(infer_data, dataloader, save_embed_path=config.save_embed_path, edge_mask_for_gnn_embeddings=None if config.no_validation else \ diff --git a/python/graphstorm/run/gsgnn_lp/lp_infer_lm.py b/python/graphstorm/run/gsgnn_lp/lp_infer_lm.py index e196d3fd83..baec5082e6 100644 --- a/python/graphstorm/run/gsgnn_lp/lp_infer_lm.py +++ b/python/graphstorm/run/gsgnn_lp/lp_infer_lm.py @@ -23,8 +23,9 @@ from graphstorm.inference import GSgnnLinkPredictionInferrer from graphstorm.eval import GSgnnMrrLPEvaluator from graphstorm.dataloading import GSgnnEdgeInferData -from graphstorm.dataloading import GSgnnLinkPredictionTestDataLoader -from graphstorm.dataloading import GSgnnLinkPredictionJointTestDataLoader +from graphstorm.dataloading import (GSgnnLinkPredictionTestDataLoader, + GSgnnLinkPredictionJointTestDataLoader, + GSgnnLinkPredictionPredefinedTestDataLoader) from graphstorm.dataloading import BUILTIN_LP_UNIFORM_NEG_SAMPLER from graphstorm.dataloading import BUILTIN_LP_JOINT_NEG_SAMPLER from graphstorm.utils import setup_device @@ -58,7 +59,9 @@ def main(config_args): tracker = gs.create_builtin_task_tracker(config) infer.setup_task_tracker(tracker) # We only support full-graph inference for now. 
- if config.eval_negative_sampler == BUILTIN_LP_UNIFORM_NEG_SAMPLER: + if config.eval_etypes_negative_dstnode is not None: + test_dataloader_cls = GSgnnLinkPredictionPredefinedTestDataLoader + elif config.eval_negative_sampler == BUILTIN_LP_UNIFORM_NEG_SAMPLER: test_dataloader_cls = GSgnnLinkPredictionTestDataLoader elif config.eval_negative_sampler == BUILTIN_LP_JOINT_NEG_SAMPLER: test_dataloader_cls = GSgnnLinkPredictionJointTestDataLoader @@ -67,9 +70,14 @@ def main(config_args): 'Supported test negative samplers include ' f'[{BUILTIN_LP_UNIFORM_NEG_SAMPLER}, {BUILTIN_LP_JOINT_NEG_SAMPLER}]') - dataloader = test_dataloader_cls(infer_data, infer_data.test_idxs, - batch_size=config.eval_batch_size, - num_negative_edges=config.num_negative_edges_eval) + if config.eval_etypes_negative_dstnode is not None: + dataloader = test_dataloader_cls(infer_data, infer_data.test_idxs, + batch_size=config.eval_batch_size, + fixed_edge_dst_negative_field=config.eval_etypes_negative_dstnode) + else: + dataloader = test_dataloader_cls(infer_data, infer_data.test_idxs, + batch_size=config.eval_batch_size, + num_negative_edges=config.num_negative_edges_eval) # Preparing input layer for training or inference. # The input layer can pre-compute node features in the preparing step if needed. # For example pre-compute all BERT embeddings diff --git a/tests/unit-tests/test_dataloading.py b/tests/unit-tests/test_dataloading.py index 8c7194b37f..f965858115 100644 --- a/tests/unit-tests/test_dataloading.py +++ b/tests/unit-tests/test_dataloading.py @@ -49,12 +49,14 @@ FastGSgnnLPJointNegDataLoader, FastGSgnnLPLocalUniformNegDataLoader, FastGSgnnLPLocalJointNegDataLoader) -from graphstorm.dataloading import GSgnnLinkPredictionTestDataLoader -from graphstorm.dataloading import GSgnnLinkPredictionJointTestDataLoader +from graphstorm.dataloading import (GSgnnLinkPredictionTestDataLoader, + GSgnnLinkPredictionJointTestDataLoader, + GSgnnLinkPredictionPredefinedTestDataLoader) from graphstorm.dataloading import DistillDataloaderGenerator, DistillDataManager from graphstorm.dataloading import DistributedFileSampler -from graphstorm.dataloading import BUILTIN_LP_UNIFORM_NEG_SAMPLER -from graphstorm.dataloading import BUILTIN_LP_JOINT_NEG_SAMPLER +from graphstorm.dataloading import (BUILTIN_LP_UNIFORM_NEG_SAMPLER, + BUILTIN_LP_JOINT_NEG_SAMPLER, + BUILTIN_LP_FIXED_NEG_SAMPLER) from graphstorm.dataloading.sampler import InbatchJointUniform from graphstorm.dataloading.sampler import GlobalUniform @@ -664,6 +666,59 @@ def test_GSgnnLinkPredictionTestDataLoader(batch_size, num_negative_edges): # after test pass, destroy all process group th.distributed.destroy_process_group() +def test_GSgnnLinkPredictionPredefinedTestDataLoader(batch_size): + th.distributed.init_process_group(backend='gloo', + init_method='tcp://127.0.0.1:23456', + rank=0, + world_size=1) + test_etypes = [("n0", "r1", "n1"), ("n0", "r0", "n1")] + with tempfile.TemporaryDirectory() as tmpdirname: + # get the test dummy distributed graph + _, part_config = generate_dummy_dist_graph(graph_name='dummy', dirname=tmpdirname) + lp_data = GSgnnEdgeInferData(graph_name='dummy', part_config=part_config, + eval_etypes=test_etypes) + g = lp_data.g + g.edges[("n0", "r1", "n1")].data["neg"] = th.randint(g.num_nodes("n1"), + (g.num_edges(("n0", "r1", "n1")), 10)) + g.edges[("n0", "r0", "n1")].data["neg"] = th.randint(g.num_nodes("n1"), + (g.num_edges(("n0", "r0", "n1")), 10)) + + dataloader = GSgnnLinkPredictionPredefinedTestDataLoader( + lp_data, + 
target_idx=lp_data.infer_idxs, # use train edges as val or test edges + batch_size=batch_size, + num_negative_edges=0, + fixed_edge_dst_negative_field="neg") + + total_edges = {etype: len(lp_data.infer_idxs[etype]) for etype in test_etypes} + num_pos_edges = {etype: 0 for etype in test_etypes} + for pos_neg_tuple, sample_type in dataloader: + assert sample_type == BUILTIN_LP_FIXED_NEG_SAMPLER + assert isinstance(pos_neg_tuple, dict) + assert len(pos_neg_tuple) == 2 + for canonical_etype, pos_neg in pos_neg_tuple.items(): + assert len(pos_neg) == 4 + pos_src, _, pos_dst, neg_dst = pos_neg + assert pos_src.shape == pos_dst.shape + assert pos_src.shape[0] == batch_size \ + if num_pos_edges[canonical_etype] + batch_size < total_edges[canonical_etype] \ + else total_edges[canonical_etype] - num_pos_edges[canonical_etype] + eid = lp_data.train_idxs[canonical_etype][num_pos_edges[canonical_etype]: \ + num_pos_edges[canonical_etype]+batch_size] \ + if num_pos_edges[canonical_etype]+batch_size < total_edges[canonical_etype] \ + else lp_data.train_idxs[canonical_etype] \ + [num_pos_edges[canonical_etype]:] + src, dst = g.find_edges(eid, etype=canonical_etype) + assert_equal(pos_src.numpy(), src.numpy()) + assert_equal(pos_dst.numpy(), dst.numpy()) + assert len(neg_dst.shape) == 2 + assert neg_dst.shape[1] == 10 + assert_equal(neg_dst.numpy(), g.edges[canonical_etype].data["neg"][eid].numpy()) + + num_pos_edges[canonical_etype] += batch_size + # after test pass, destroy all process group + th.distributed.destroy_process_group() + # initialize the torch distributed environment @pytest.mark.parametrize("batch_size", [1, 10, 128]) @pytest.mark.parametrize("num_negative_edges", [1, 16, 128]) @@ -1788,6 +1843,8 @@ def test_inbatch_joint_neg_sampler(num_pos, num_neg): if __name__ == '__main__': + test_GSgnnLinkPredictionPredefinedTestDataLoader(1) + test_GSgnnLinkPredictionPredefinedTestDataLoader(10) test_edge_fixed_dst_negative_sample_gen_neg_pairs() test_hard_edge_dst_negative_sample_generate_complex_case() test_hard_edge_dst_negative_sample_generate() From 440b8c0024915ec3f1d512fc4cdb5574b319b218 Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Wed, 20 Dec 2023 14:02:18 -0800 Subject: [PATCH 11/17] Fix bugs --- python/graphstorm/dataloading/dataloading.py | 14 +++++++ python/graphstorm/dataloading/sampler.py | 43 +++++--------------- tests/unit-tests/test_dataloading.py | 25 +++++++----- 3 files changed, 40 insertions(+), 42 deletions(-) diff --git a/python/graphstorm/dataloading/dataloading.py b/python/graphstorm/dataloading/dataloading.py index 517ec236aa..ff0c6861c5 100644 --- a/python/graphstorm/dataloading/dataloading.py +++ b/python/graphstorm/dataloading/dataloading.py @@ -1117,6 +1117,20 @@ def _prepare_negative_sampler(self, _): self._neg_sample_type = BUILTIN_LP_FIXED_NEG_SAMPLER return negative_sampler + def _next_data(self, etype): + """ Get postive edges for the next iteration for a specific edge type + """ + g = self._data.g + current_pos = self._current_pos[etype] + end_of_etype = current_pos + self._batch_size >= self._fixed_test_size[etype] + + pos_eids = self._target_idx[etype][current_pos:self._fixed_test_size[etype]] \ + if end_of_etype \ + else self._target_idx[etype][current_pos:current_pos+self._batch_size] + pos_neg_tuple = self._negative_sampler.gen_etype_neg_pairs(g, etype, pos_eids) + self._current_pos[etype] += self._batch_size + return pos_neg_tuple, end_of_etype + ################ Minibatch DataLoader (Node classification) ####################### class 
GSgnnNodeDataLoaderBase(): diff --git a/python/graphstorm/dataloading/sampler.py b/python/graphstorm/dataloading/sampler.py index fc474d85e9..c8f457706d 100644 --- a/python/graphstorm/dataloading/sampler.py +++ b/python/graphstorm/dataloading/sampler.py @@ -179,15 +179,6 @@ def _generate(self, g, eids, canonical_etype): hard_negative[:num_hard_neg if num_hard_neg < self._k else self._k] return src, neg - def gen_neg_pairs(self, _): - """ TODO: Do not support generating negative pairs for evaluation in the same way as - generating negative pairs for training now. - Please use GSFixedEdgeDstNegativeSampler instead. - """ - raise RuntimeError("Sampling negative edges for evaluation purpose" - "is not supported for GSHardEdgeDstNegativeSampler. " - "Please use GSFixedEdgeDstNegativeSampler instead.") - class GSFixedEdgeDstNegativeSampler(object): """ GraphStorm negative sampler that uses fixed negative destination nodes to create negative edges. @@ -204,7 +195,7 @@ def __init__(self, dst_negative_field): "Hard negative is not supported for WholeGraph." self._dst_negative_field = dst_negative_field - def gen_neg_pairs(self, g, pos_pairs): + def gen_etype_neg_pairs(self, g, etype, pos_eids): """ Returns negative examples associated with positive examples. It only return dst negatives. @@ -215,33 +206,25 @@ def gen_neg_pairs(self, g, pos_pairs): ---------- g : DGLGraph The graph. - pos_pairs : (Tensor, Tensor) or dict[etype, (Tensor, Tensor)] - The positive node pairs + pos_eids : (Tensor, Tensor) or dict[etype, (Tensor, Tensor)] + The positive edge ids. Returns ------- - tuple[Tensor, Tensor, Tensor, Tensor] or dict[etype, tuple(Tensor, Tensor Tensor, Tensor) The returned [positive source, negative source, postive destination, negatve destination] tuples as pos-neg examples. """ - def _gen_neg_pair(pos_pair, canonical_etype): - src, pos_dst = pos_pair - eids = g.edge_ids(src, pos_dst, etype=canonical_etype) + def _gen_neg_pair(eids, canonical_etype): + src, pos_dst = g.find_edges(eids, etype=canonical_etype) if isinstance(self._dst_negative_field, str): dst_negative_field = self._dst_negative_field elif canonical_etype in self._dst_negative_field: dst_negative_field = self._dst_negative_field[canonical_etype] else: - dst_negative_field = None - - if dst_negative_field is None: - random_neg_pairs = \ - self._negative_sampler.gen_neg_pairs(g, {canonical_etype:pos_pair}) - src, _, pos_dst, neg_dst = random_neg_pairs[canonical_etype] - return (src, None, pos_dst, neg_dst) + raise RuntimeError(f"{etype} does not have pre-defined negatives") fixed_negatives = g.edges[canonical_etype].data[dst_negative_field][eids] @@ -257,15 +240,11 @@ def _gen_neg_pair(pos_pair, canonical_etype): logging.debug("The number of fixed negative is %d", num_fixed_neg) return (src, None, pos_dst, fixed_negatives) - if isinstance(pos_pairs, Mapping): - pos_neg_tuple = {} - for canonical_etype, pos_pair in pos_pairs.items(): - pos_neg_tuple[canonical_etype] = _gen_neg_pair(pos_pair, canonical_etype) - else: - assert len(g.canonical_etypes) == 1, \ - 'please specify a dict of etypes and ids for graphs with multiple edge types' - pos_neg_tuple = _gen_neg_pair(pos_pairs, canonical_etype) - return pos_neg_tuple + assert etype in g.canonical_etypes, \ + f"Edge type {etype} does not exist in graph. 
Expecting an edge type in " \ + f"{g.canonical_etypes}, but get {etype}" + + return {etype: _gen_neg_pair(pos_eids, etype)} class GlobalUniform(Uniform): """Negative sampler that randomly chooses negative destination nodes diff --git a/tests/unit-tests/test_dataloading.py b/tests/unit-tests/test_dataloading.py index f965858115..076f49a884 100644 --- a/tests/unit-tests/test_dataloading.py +++ b/tests/unit-tests/test_dataloading.py @@ -687,7 +687,6 @@ def test_GSgnnLinkPredictionPredefinedTestDataLoader(batch_size): lp_data, target_idx=lp_data.infer_idxs, # use train edges as val or test edges batch_size=batch_size, - num_negative_edges=0, fixed_edge_dst_negative_field="neg") total_edges = {etype: len(lp_data.infer_idxs[etype]) for etype in test_etypes} @@ -695,7 +694,7 @@ def test_GSgnnLinkPredictionPredefinedTestDataLoader(batch_size): for pos_neg_tuple, sample_type in dataloader: assert sample_type == BUILTIN_LP_FIXED_NEG_SAMPLER assert isinstance(pos_neg_tuple, dict) - assert len(pos_neg_tuple) == 2 + assert len(pos_neg_tuple) == 1 for canonical_etype, pos_neg in pos_neg_tuple.items(): assert len(pos_neg) == 4 pos_src, _, pos_dst, neg_dst = pos_neg @@ -703,10 +702,10 @@ def test_GSgnnLinkPredictionPredefinedTestDataLoader(batch_size): assert pos_src.shape[0] == batch_size \ if num_pos_edges[canonical_etype] + batch_size < total_edges[canonical_etype] \ else total_edges[canonical_etype] - num_pos_edges[canonical_etype] - eid = lp_data.train_idxs[canonical_etype][num_pos_edges[canonical_etype]: \ + eid = lp_data.infer_idxs[canonical_etype][num_pos_edges[canonical_etype]: \ num_pos_edges[canonical_etype]+batch_size] \ if num_pos_edges[canonical_etype]+batch_size < total_edges[canonical_etype] \ - else lp_data.train_idxs[canonical_etype] \ + else lp_data.infer_idxs[canonical_etype] \ [num_pos_edges[canonical_etype]:] src, dst = g.find_edges(eid, etype=canonical_etype) assert_equal(pos_src.numpy(), src.numpy()) @@ -1780,12 +1779,9 @@ def test_edge_fixed_dst_negative_sample_gen_neg_pairs(): num_nodes = 1000 # test GSHardEdgeDstNegativeSampler._generate when all some pos edges do not have enough hard negatives defined num_negs = 10 - etype0, etype1, etype2, hard0, _, _, src, dst, g = _create_hard_neg_graph(num_nodes, num_negs) + etype0, etype1, etype2, hard0, hard1, hard2, src, dst, g = _create_hard_neg_graph(num_nodes, num_negs) num_edges = 10 - pos_pairs = {etype0: (th.arange(10), th.arange(10)), - etype1: (th.arange(10), th.arange(10)), - etype2: (th.arange(10), th.arange(10))} def check_fixed_negs(pos_neg_tuple, etype, hard_neg_data): neg_src, _, pos_dst, neg_dst = pos_neg_tuple[etype] @@ -1799,19 +1795,28 @@ def check_fixed_negs(pos_neg_tuple, etype, hard_neg_data): assert_equal(hard_neg_data[:10].numpy(), neg_dst.numpy()) hard_sampler = GSFixedEdgeDstNegativeSampler("hard_negative") - pos_neg_tuple = hard_sampler.gen_neg_pairs(g, pos_pairs) + pos_neg_tuple = hard_sampler.gen_etype_neg_pairs(g, etype0, th.arange(10)) check_fixed_negs(pos_neg_tuple, etype0, hard0) + pos_neg_tuple = hard_sampler.gen_etype_neg_pairs(g, etype1, th.arange(10)) + check_fixed_negs(pos_neg_tuple, etype1, hard1) + pos_neg_tuple = hard_sampler.gen_etype_neg_pairs(g, etype2, th.arange(10)) + check_fixed_negs(pos_neg_tuple, etype2, hard2) hard_sampler = GSFixedEdgeDstNegativeSampler({etype0: "hard_negative", etype1: "hard_negative", etype2: "hard_negative"}) + pos_neg_tuple = hard_sampler.gen_etype_neg_pairs(g, etype0, th.arange(10)) check_fixed_negs(pos_neg_tuple, etype0, hard0) + pos_neg_tuple = 
hard_sampler.gen_etype_neg_pairs(g, etype1, th.arange(10)) + check_fixed_negs(pos_neg_tuple, etype1, hard1) + pos_neg_tuple = hard_sampler.gen_etype_neg_pairs(g, etype2, th.arange(10)) + check_fixed_negs(pos_neg_tuple, etype2, hard2) # each positive edge should have enough fixed negatives hard0[0][-1] = -1 fail = False try: - pos_neg_tuple = hard_sampler.gen_neg_pairs(g, pos_pairs) + pos_neg_tuple = hard_sampler.gen_neg_pairs(g, etype0, th.arange(10)) except: fail = True assert fail From 6a7a73a5916a859dd582cf7d68d8ff516f749be3 Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Mon, 25 Dec 2023 22:25:04 -0800 Subject: [PATCH 12/17] Update --- python/graphstorm/dataloading/sampler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/graphstorm/dataloading/sampler.py b/python/graphstorm/dataloading/sampler.py index c8f457706d..a25170ce00 100644 --- a/python/graphstorm/dataloading/sampler.py +++ b/python/graphstorm/dataloading/sampler.py @@ -100,7 +100,8 @@ def __init__(self, k, dst_negative_field, negative_sampler, num_hard_negs=None): def _generate(self, g, eids, canonical_etype): """ _generate() is called by DGL BaseNegativeSampler to generate negative pairs. - See https://github.com/dmlc/dgl/blob/1.1.x/python/dgl/dataloading/negative_sampler.py#L7 For more detials + See https://github.com/dmlc/dgl/blob/1.1.x/python/dgl/dataloading/negative_sampler.py#L7 + For more detials """ if isinstance(self._dst_negative_field, str): dst_negative_field = self._dst_negative_field From 1fcc1a229ed37f0250f8159c3b15328daa7d1ad7 Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Mon, 25 Dec 2023 23:23:40 -0800 Subject: [PATCH 13/17] Update --- python/graphstorm/dataloading/dataloading.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/python/graphstorm/dataloading/dataloading.py b/python/graphstorm/dataloading/dataloading.py index ff0c6861c5..fa22e5e9f4 100644 --- a/python/graphstorm/dataloading/dataloading.py +++ b/python/graphstorm/dataloading/dataloading.py @@ -904,7 +904,9 @@ def _prepare_dataloader(self, dataset, target_idxs, fanout, num_negative_edges, reverse_edge_types_map=None, edge_mask_for_gnn_embeddings=None, construct_feat_ntype=None, - construct_feat_fanout=5): + construct_feat_fanout=5, + edge_dst_negative_field=None, + num_hard_negs=None): g = dataset.g if construct_feat_ntype is None: construct_feat_ntype = [] @@ -921,6 +923,12 @@ def _prepare_dataloader(self, dataset, target_idxs, fanout, num_negative_edges, dataset, construct_feat_ntype, construct_feat_fanout) negative_sampler = self._prepare_negative_sampler(num_negative_edges) + if edge_dst_negative_field is not None: + negative_sampler = GSHardEdgeDstNegativeSampler(num_negative_edges, + edge_dst_negative_field, + negative_sampler, + num_hard_negs) + # edge loader if train_task: if isinstance(target_idxs, dict): From 186f1f64c20ceafac48ca6b970295a5eb46aee70 Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Wed, 27 Dec 2023 21:43:39 -0800 Subject: [PATCH 14/17] Add end2end test --- python/graphstorm/config/argument.py | 4 +- python/graphstorm/gconstruct/transform.py | 2 +- python/graphstorm/model/edge_decoder.py | 19 +++-- tests/end2end-tests/create_data.sh | 9 +++ .../data_gen/movielens_lp_hard.json | 78 +++++++++++++++++++ .../data_gen/process_movielens.py | 21 +++++ .../end2end-tests/graphstorm-lp/mgpu_test.sh | 24 ++++++ 7 files changed, 149 insertions(+), 8 deletions(-) create mode 100644 tests/end2end-tests/data_gen/movielens_lp_hard.json diff --git 
a/python/graphstorm/config/argument.py b/python/graphstorm/config/argument.py index e6ebc95f79..17e8e92e96 100644 --- a/python/graphstorm/config/argument.py +++ b/python/graphstorm/config/argument.py @@ -1874,11 +1874,11 @@ def lp_edge_weight_for_loss(self): return None - def _get_predefined_negatives_per_etype(negatives): + def _get_predefined_negatives_per_etype(self, negatives): if len(negatives) == 1 and \ ":" not in negatives[0]: # global feat_name - return int(negatives[0]) + return negatives[0] # per edge type feature negative_dict = {} diff --git a/python/graphstorm/gconstruct/transform.py b/python/graphstorm/gconstruct/transform.py index 300922f027..0a2b82e622 100644 --- a/python/graphstorm/gconstruct/transform.py +++ b/python/graphstorm/gconstruct/transform.py @@ -1075,7 +1075,7 @@ class HardEdgeDstNegativeTransform(HardEdgeNegativeTransform): """ def set_target_etype(self, etype): - self._target_etype = etype + self._target_etype = tuple(etype) # target node type is destination node type. self._target_ntype = etype[2] diff --git a/python/graphstorm/model/edge_decoder.py b/python/graphstorm/model/edge_decoder.py index 427d15336c..641590e496 100644 --- a/python/graphstorm/model/edge_decoder.py +++ b/python/graphstorm/model/edge_decoder.py @@ -24,7 +24,8 @@ from .ngnn_mlp import NGNNMLP from .gs_layer import GSLayer, GSLayerNoParam from ..dataloading import (BUILTIN_LP_UNIFORM_NEG_SAMPLER, - BUILTIN_LP_JOINT_NEG_SAMPLER) + BUILTIN_LP_JOINT_NEG_SAMPLER, + BUILTIN_LP_FIXED_NEG_SAMPLER) from ..eval.utils import calc_distmult_pos_score, calc_dot_pos_score from ..eval.utils import calc_distmult_neg_head_score, calc_distmult_neg_tail_score @@ -628,7 +629,9 @@ def calc_test_scores(self, emb, pos_neg_tuple, neg_sample_type, device): neg_scores = [] if neg_src is not None: neg_src_emb = emb[utype][neg_src.reshape(-1,)].to(device) - if neg_sample_type == BUILTIN_LP_UNIFORM_NEG_SAMPLER: + if neg_sample_type in [BUILTIN_LP_UNIFORM_NEG_SAMPLER, + BUILTIN_LP_FIXED_NEG_SAMPLER]: + # fixed negative sample is similar to uniform negative sample neg_src_emb = neg_src_emb.reshape( neg_src.shape[0], neg_src.shape[1], -1) pos_dst_emb = pos_dst_emb.reshape( @@ -654,7 +657,9 @@ def calc_test_scores(self, emb, pos_neg_tuple, neg_sample_type, device): neg_scores.append(neg_score) if neg_dst is not None: - if neg_sample_type == BUILTIN_LP_UNIFORM_NEG_SAMPLER: + if neg_sample_type in [BUILTIN_LP_UNIFORM_NEG_SAMPLER, \ + BUILTIN_LP_FIXED_NEG_SAMPLER]: + # fixed negative sample is similar to uniform negative sample neg_dst_emb = emb[vtype][neg_dst.reshape(-1,)].to(device) neg_dst_emb = neg_dst_emb.reshape( neg_dst.shape[0], neg_dst.shape[1], -1) @@ -881,7 +886,9 @@ def calc_test_scores(self, emb, pos_neg_tuple, neg_sample_type, device): if neg_src is not None: neg_src_emb = emb[utype][neg_src.reshape(-1,)] - if neg_sample_type == BUILTIN_LP_UNIFORM_NEG_SAMPLER: + if neg_sample_type == [BUILTIN_LP_UNIFORM_NEG_SAMPLER, + BUILTIN_LP_FIXED_NEG_SAMPLER]: + # fixed negative sample is similar to uniform negative sample neg_src_emb = neg_src_emb.reshape(neg_src.shape[0], neg_src.shape[1], -1) # uniform sampled negative samples pos_dst_emb = pos_dst_emb.reshape( @@ -912,7 +919,9 @@ def calc_test_scores(self, emb, pos_neg_tuple, neg_sample_type, device): neg_scores.append(neg_score) if neg_dst is not None: - if neg_sample_type == BUILTIN_LP_UNIFORM_NEG_SAMPLER: + if neg_sample_type == [BUILTIN_LP_UNIFORM_NEG_SAMPLER, + BUILTIN_LP_FIXED_NEG_SAMPLER]: + # fixed negative sample is similar to uniform negative sample 
neg_dst_emb = emb[vtype][neg_dst.reshape(-1,)] neg_dst_emb = neg_dst_emb.reshape(neg_dst.shape[0], neg_dst.shape[1], -1) # uniform sampled negative samples diff --git a/tests/end2end-tests/create_data.sh b/tests/end2end-tests/create_data.sh index 830fe46ba8..d1a6a65eb5 100644 --- a/tests/end2end-tests/create_data.sh +++ b/tests/end2end-tests/create_data.sh @@ -41,6 +41,15 @@ python3 -m graphstorm.gconstruct.construct_graph \ --graph-name movie-lens-100k \ --add-reverse-edges +# movielens link prediction - hard negative and fixed negative for inference +rm -Rf /data/movielen_100k_lp_train_val_hard_neg_1p_4t +python3 -m graphstorm.gconstruct.construct_graph \ + --conf-file $GS_HOME/tests/end2end-tests/data_gen/movielens_lp_hard.json \ + --num-processes 1 \ + --output-dir movielen_100k_lp_train_val_hard_neg_1p_4t \ + --graph-name movie-lens-100k \ + --add-reverse-edges + # movielens link prediction removing test mask rm -Rf /data/movielen_100k_lp_train_no_test_1p_4t cp -R /data/movielen_100k_lp_train_val_1p_4t /data/movielen_100k_lp_train_no_test_1p_4t diff --git a/tests/end2end-tests/data_gen/movielens_lp_hard.json b/tests/end2end-tests/data_gen/movielens_lp_hard.json new file mode 100644 index 0000000000..9a0804c6c2 --- /dev/null +++ b/tests/end2end-tests/data_gen/movielens_lp_hard.json @@ -0,0 +1,78 @@ +{ + "version": "gconstruct-v0.1", + "nodes": [ + { + "node_id_col": "id", + "node_type": "user", + "format": {"name": "hdf5"}, + "files": "/data/ml-100k/user.hdf5", + "features": [ + { + "feature_col": "feat" + } + ] + }, + { + "node_id_col": "id", + "node_type": "movie", + "format": {"name": "parquet"}, + "files": "/data/ml-100k/movie.parquet", + "features": [ + { + "feature_col": "title", + "transform": { + "name": "bert_hf", + "bert_model": "bert-base-uncased", + "max_seq_length": 16 + } + } + ] + } + ], + "edges": [ + { + "source_id_col": "src_id", + "dest_id_col": "dst_id", + "relation": ["user", "rating", "movie"], + "format": {"name": "parquet"}, + "files": "/data/ml-100k/edges.parquet", + "labels": [ + { + "task_type": "link_prediction", + "split_pct": [0.1, 0.1, 0.1] + } + ], + "features":[ + { + "feature_col": "rate", + "feature_name": "rate" + } + ] + }, + { + "relation": ["user", "rating", "movie"], + "format": {"name": "parquet"}, + "files": "/data/ml-100k/hard_neg.parquet", + "features": [ + { + "feature_col": "hard_0", + "feature_name": "hard_0", + "transform": {"name": "edge_dst_hard_negative"} + }, + { + "feature_col": "hard_1", + "feature_name": "hard_1", + "transform": { + "name": "edge_dst_hard_negative", + "separator": "," + } + }, + { + "feature_col": "fixed_eval", + "feature_name": "fixed_eval", + "transform": {"name": "edge_dst_hard_negative"} + } + ] + } + ] +} \ No newline at end of file diff --git a/tests/end2end-tests/data_gen/process_movielens.py b/tests/end2end-tests/data_gen/process_movielens.py index 90fdcd1702..e17934d45f 100644 --- a/tests/end2end-tests/data_gen/process_movielens.py +++ b/tests/end2end-tests/data_gen/process_movielens.py @@ -90,6 +90,27 @@ def write_data_parquet(data, data_file): edge_data = {'src_id': edges[0], 'dst_id': edges[1], 'rate': edges[2]} write_data_parquet(edge_data, '/data/ml-100k/edges.parquet') +# generate hard negatives +num_movies = len(ids) +neg_movie_idx = np.random.randint(0, num_movies, (edges.shape[0], 5)) +neg_movie_0 = ids[neg_movie_idx] +neg_movie_1 = [] +for idx, neg_movie in enumerate(neg_movie_0): + if idx < 10: + neg_movie_1.append(list(neg_movie.astype(str))[0]) + else: + 
neg_movie_1.append(",".join(list(neg_movie.astype(str)))) +neg_movie_1 = np.array(neg_movie_1) +neg_movie_idx = np.random.randint(0, num_movies, (edges.shape[0], 10)) +neg_movie_2 = ids[neg_movie_idx] + +neg_edge_data = { + "hard_0": neg_movie_0, + "hard_1": neg_movie_1, + "fixed_eval": neg_movie_2 +} +write_data_parquet(neg_edge_data, '/data/ml-100k/hard_neg.parquet') + # generate synthetic user data with label user_labels = np.random.randint(11, size=feat.shape[0]) user_data = {'id': user['id'].values, 'feat': feat, 'occupation': user['occupation'], 'label': user_labels} diff --git a/tests/end2end-tests/graphstorm-lp/mgpu_test.sh b/tests/end2end-tests/graphstorm-lp/mgpu_test.sh index 4ffd6bb261..dee7ea2038 100644 --- a/tests/end2end-tests/graphstorm-lp/mgpu_test.sh +++ b/tests/end2end-tests/graphstorm-lp/mgpu_test.sh @@ -525,4 +525,28 @@ python3 -m graphstorm.run.launch --workspace $GS_HOME/training_scripts/gsgnn_lp error_and_exit $? +echo "**************dataset: Movielens, RGCN layer 2, node feat: fixed HF BERT, BERT nodes: movie, inference: full-graph, negative_sampler: joint, exclude_training_targets: true, save model, enough hard neg" +python3 -m graphstorm.run.gs_link_prediction --workspace $GS_HOME/training_scripts/gsgnn_lp --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_lp_train_val_hard_neg_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_lp.yaml --fanout '10,15' --num-layers 2 --use-mini-batch-infer false --eval-batch-size 1024 --exclude-training-targets True --reverse-edge-types-map user,rating,rating-rev,movie --save-model-path /data/gsgnn_lp_ml_hard_dot/ --save-model-frequency 1000 --train-etypes-negative-dstnode hard_0 --num-train-hard-negatives 4 --num-negative-edges 10 --target-etype user,rating,movie + +error_and_exit $? + +echo "**************dataset: Movielens, do inference on saved model, decoder: dot with fixed negative" +python3 -m graphstorm.run.gs_link_prediction --inference --workspace $GS_HOME/inference_scripts/lp_infer --num-trainers $NUM_INFO_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_lp_train_val_hard_neg_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_lp_infer.yaml --fanout '10,15' --num-layers 2 --use-mini-batch-infer false --eval-batch-size 1024 --restore-model-path /data/gsgnn_lp_ml_hard_dot/epoch-2/ --eval-etypes-negative-dstnode fixed_eval --eval-etype user,rating,movie + +error_and_exit $? + +rm -fr /data/gsgnn_lp_ml_hard_dot/* + +echo "**************dataset: Movielens, RGCN layer 2, node feat: fixed HF BERT, BERT nodes: movie, inference: full-graph, negative_sampler: joint, exclude_training_targets: true, save model, hard neg + random neg" +python3 -m graphstorm.run.gs_link_prediction --workspace $GS_HOME/training_scripts/gsgnn_lp --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_lp_train_val_hard_neg_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_lp.yaml --fanout '10,15' --num-layers 2 --use-mini-batch-infer false --eval-batch-size 1024 --exclude-training-targets True --reverse-edge-types-map user,rating,rating-rev,movie --save-model-path /data/gsgnn_lp_ml_hard_dot/ --save-model-frequency 1000 --train-etypes-negative-dstnode user,rating,movie:hard_1 --num-train-hard-negatives 5 --num-negative-edges 10 --target-etype user,rating,movie + +error_and_exit $? 
+ +echo "**************dataset: Movielens, do inference on saved model, decoder: dot with fixed negative" +python3 -m graphstorm.run.gs_link_prediction --inference --workspace $GS_HOME/inference_scripts/lp_infer --num-trainers $NUM_INFO_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_lp_train_val_hard_neg_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_lp_infer.yaml --fanout '10,15' --num-layers 2 --use-mini-batch-infer false --eval-batch-size 1024 --restore-model-path /data/gsgnn_lp_ml_hard_dot/epoch-2/ --eval-etypes-negative-dstnode user,rating,movie:fixed_eval --eval-etype user,rating,movie + +error_and_exit $? + +rm -fr /data/gsgnn_lp_ml_hard_dot/* + rm -fr /tmp/* From 9a44fd5386dee31c4a15b436c3e6dca9575a4030 Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Wed, 27 Dec 2023 23:08:42 -0800 Subject: [PATCH 15/17] Fix lint --- .../graphstorm/run/gsgnn_lp/lp_infer_gnn.py | 34 +++++++++---------- python/graphstorm/run/gsgnn_lp/lp_infer_lm.py | 30 ++++++++-------- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/python/graphstorm/run/gsgnn_lp/lp_infer_gnn.py b/python/graphstorm/run/gsgnn_lp/lp_infer_gnn.py index 3d6a854999..9ed19212c7 100644 --- a/python/graphstorm/run/gsgnn_lp/lp_infer_gnn.py +++ b/python/graphstorm/run/gsgnn_lp/lp_infer_gnn.py @@ -60,26 +60,26 @@ def main(config_args): infer.setup_task_tracker(tracker) # We only support full-graph inference for now. if config.eval_etypes_negative_dstnode is not None: - test_dataloader_cls = GSgnnLinkPredictionPredefinedTestDataLoader - elif config.eval_negative_sampler == BUILTIN_LP_UNIFORM_NEG_SAMPLER: - test_dataloader_cls = GSgnnLinkPredictionTestDataLoader - elif config.eval_negative_sampler == BUILTIN_LP_JOINT_NEG_SAMPLER: - test_dataloader_cls = GSgnnLinkPredictionJointTestDataLoader + # The negatives used in evaluation is fixed. + dataloader = GSgnnLinkPredictionPredefinedTestDataLoader( + infer_data, infer_data.test_idxs, + batch_size=config.eval_batch_size, + fixed_edge_dst_negative_field=config.eval_etypes_negative_dstnode, + fanout=config.eval_fanout) else: - raise ValueError('Unknown test negative sampler.' - 'Supported test negative samplers include ' - f'[{BUILTIN_LP_UNIFORM_NEG_SAMPLER}, {BUILTIN_LP_JOINT_NEG_SAMPLER}]') + if config.eval_negative_sampler == BUILTIN_LP_UNIFORM_NEG_SAMPLER: + test_dataloader_cls = GSgnnLinkPredictionTestDataLoader + elif config.eval_negative_sampler == BUILTIN_LP_JOINT_NEG_SAMPLER: + test_dataloader_cls = GSgnnLinkPredictionJointTestDataLoader + else: + raise ValueError('Unknown test negative sampler.' 
+ 'Supported test negative samplers include ' + f'[{BUILTIN_LP_UNIFORM_NEG_SAMPLER}, {BUILTIN_LP_JOINT_NEG_SAMPLER}]') - if config.eval_etypes_negative_dstnode is not None: - dataloader = test_dataloader_cls(infer_data, infer_data.test_idxs, - batch_size=config.eval_batch_size, - fixed_edge_dst_negative_field=config.eval_etypes_negative_dstnode, - fanout=config.eval_fanout) - else: dataloader = test_dataloader_cls(infer_data, infer_data.test_idxs, - batch_size=config.eval_batch_size, - num_negative_edges=config.num_negative_edges_eval, - fanout=config.eval_fanout) + batch_size=config.eval_batch_size, + num_negative_edges=config.num_negative_edges_eval, + fanout=config.eval_fanout) infer.infer(infer_data, dataloader, save_embed_path=config.save_embed_path, edge_mask_for_gnn_embeddings=None if config.no_validation else \ diff --git a/python/graphstorm/run/gsgnn_lp/lp_infer_lm.py b/python/graphstorm/run/gsgnn_lp/lp_infer_lm.py index baec5082e6..bdf1becc7a 100644 --- a/python/graphstorm/run/gsgnn_lp/lp_infer_lm.py +++ b/python/graphstorm/run/gsgnn_lp/lp_infer_lm.py @@ -60,24 +60,24 @@ def main(config_args): infer.setup_task_tracker(tracker) # We only support full-graph inference for now. if config.eval_etypes_negative_dstnode is not None: - test_dataloader_cls = GSgnnLinkPredictionPredefinedTestDataLoader - elif config.eval_negative_sampler == BUILTIN_LP_UNIFORM_NEG_SAMPLER: - test_dataloader_cls = GSgnnLinkPredictionTestDataLoader - elif config.eval_negative_sampler == BUILTIN_LP_JOINT_NEG_SAMPLER: - test_dataloader_cls = GSgnnLinkPredictionJointTestDataLoader + # The negatives used in evaluation is fixed. + dataloader = GSgnnLinkPredictionPredefinedTestDataLoader( + infer_data, infer_data.test_idxs, + batch_size=config.eval_batch_size, + fixed_edge_dst_negative_field=config.eval_etypes_negative_dstnode) else: - raise ValueError('Unknown test negative sampler.' - 'Supported test negative samplers include ' - f'[{BUILTIN_LP_UNIFORM_NEG_SAMPLER}, {BUILTIN_LP_JOINT_NEG_SAMPLER}]') + if config.eval_negative_sampler == BUILTIN_LP_UNIFORM_NEG_SAMPLER: + test_dataloader_cls = GSgnnLinkPredictionTestDataLoader + elif config.eval_negative_sampler == BUILTIN_LP_JOINT_NEG_SAMPLER: + test_dataloader_cls = GSgnnLinkPredictionJointTestDataLoader + else: + raise ValueError('Unknown test negative sampler.' + 'Supported test negative samplers include ' + f'[{BUILTIN_LP_UNIFORM_NEG_SAMPLER}, {BUILTIN_LP_JOINT_NEG_SAMPLER}]') - if config.eval_etypes_negative_dstnode is not None: - dataloader = test_dataloader_cls(infer_data, infer_data.test_idxs, - batch_size=config.eval_batch_size, - fixed_edge_dst_negative_field=config.eval_etypes_negative_dstnode) - else: dataloader = test_dataloader_cls(infer_data, infer_data.test_idxs, - batch_size=config.eval_batch_size, - num_negative_edges=config.num_negative_edges_eval) + batch_size=config.eval_batch_size, + num_negative_edges=config.num_negative_edges_eval) # Preparing input layer for training or inference. # The input layer can pre-compute node features in the preparing step if needed. 
# For example pre-compute all BERT embeddings From b43bd921167295e73847c6d048709939a6f59d25 Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Wed, 27 Dec 2023 23:40:01 -0800 Subject: [PATCH 16/17] Fix CI --- tests/unit-tests/test_dataloading.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit-tests/test_dataloading.py b/tests/unit-tests/test_dataloading.py index 076f49a884..ee8dc50fad 100644 --- a/tests/unit-tests/test_dataloading.py +++ b/tests/unit-tests/test_dataloading.py @@ -666,6 +666,7 @@ def test_GSgnnLinkPredictionTestDataLoader(batch_size, num_negative_edges): # after test pass, destroy all process group th.distributed.destroy_process_group() +@pytest.mark.parametrize("batch_size", [1, 10, 128]) def test_GSgnnLinkPredictionPredefinedTestDataLoader(batch_size): th.distributed.init_process_group(backend='gloo', init_method='tcp://127.0.0.1:23456', From be62993a4831533accd7233e3414d4768d1660c6 Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Sat, 30 Dec 2023 12:59:09 -0800 Subject: [PATCH 17/17] Update --- python/graphstorm/model/edge_decoder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/graphstorm/model/edge_decoder.py b/python/graphstorm/model/edge_decoder.py index 641590e496..b35b5f43c5 100644 --- a/python/graphstorm/model/edge_decoder.py +++ b/python/graphstorm/model/edge_decoder.py @@ -886,7 +886,7 @@ def calc_test_scores(self, emb, pos_neg_tuple, neg_sample_type, device): if neg_src is not None: neg_src_emb = emb[utype][neg_src.reshape(-1,)] - if neg_sample_type == [BUILTIN_LP_UNIFORM_NEG_SAMPLER, + if neg_sample_type in [BUILTIN_LP_UNIFORM_NEG_SAMPLER, BUILTIN_LP_FIXED_NEG_SAMPLER]: # fixed negative sample is similar to uniform negative sample neg_src_emb = neg_src_emb.reshape(neg_src.shape[0], neg_src.shape[1], -1) @@ -919,7 +919,7 @@ def calc_test_scores(self, emb, pos_neg_tuple, neg_sample_type, device): neg_scores.append(neg_score) if neg_dst is not None: - if neg_sample_type == [BUILTIN_LP_UNIFORM_NEG_SAMPLER, + if neg_sample_type in [BUILTIN_LP_UNIFORM_NEG_SAMPLER, BUILTIN_LP_FIXED_NEG_SAMPLER]: # fixed negative sample is similar to uniform negative sample neg_dst_emb = emb[vtype][neg_dst.reshape(-1,)]
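
A minimal usage sketch of the evaluation path added by this series (illustrative only, not part of the patches). It assumes the partitioned MovieLens graph and ip_list.txt produced by the end-to-end test above, with the "fixed_eval" edge feature (written by process_movielens.py) holding the predefined negative destination node ids; the graph name, partition-config path, and batch size are placeholders copied from mgpu_test.sh, and the standard gs.initialize() setup is assumed.

    # Usage sketch: rank test edges against predefined (fixed) negatives.
    import graphstorm as gs
    from graphstorm.dataloading import (GSgnnEdgeInferData,
                                        GSgnnLinkPredictionPredefinedTestDataLoader,
                                        BUILTIN_LP_FIXED_NEG_SAMPLER)

    gs.initialize(ip_config="ip_list.txt", backend="gloo")

    # Placeholder graph name / partition config from the end-to-end test setup.
    infer_data = GSgnnEdgeInferData(
        graph_name="movie-lens-100k",
        part_config="/data/movielen_100k_lp_train_val_hard_neg_1p_4t/movie-lens-100k.json",
        eval_etypes=[("user", "rating", "movie")])

    # Each positive test edge is scored against the negatives stored in "fixed_eval".
    dataloader = GSgnnLinkPredictionPredefinedTestDataLoader(
        infer_data, infer_data.test_idxs,
        batch_size=1024,
        fixed_edge_dst_negative_field="fixed_eval")

    for pos_neg_tuple, neg_sample_type in dataloader:
        assert neg_sample_type == BUILTIN_LP_FIXED_NEG_SAMPLER
        for etype, (pos_src, neg_src, pos_dst, neg_dst) in pos_neg_tuple.items():
            # neg_src is None for the fixed sampler; neg_dst has shape
            # (number of positive edges in the batch, number of fixed negatives per edge).
            print(etype, pos_src.shape, neg_dst.shape)

On the training side, PATCH 13/17 makes _prepare_dataloader wrap the configured negative sampler with GSHardEdgeDstNegativeSampler whenever edge_dst_negative_field is set, which is the path exercised by the --train-etypes-negative-dstnode and --num-train-hard-negatives flags in mgpu_test.sh.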