diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py index 8843e61ad89..cc4ce474f2d 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,7 +11,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional, Tuple, Union +from typing import Optional, Union from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph from cugraph.utilities.utils import import_optional @@ -29,7 +29,7 @@ class GATConv(BaseConv): Parameters ---------- - in_feats : int or tuple + in_feats : int or (int, int) Input feature size. A pair denotes feature sizes of source and destination nodes. out_feats : int @@ -92,7 +92,7 @@ class GATConv(BaseConv): def __init__( self, - in_feats: Union[int, Tuple[int, int]], + in_feats: Union[int, tuple[int, int]], out_feats: int, num_heads: int, feat_drop: float = 0.0, @@ -104,14 +104,19 @@ def __init__( bias: bool = True, ): super().__init__() + + if isinstance(in_feats, int): + self.in_feats_src = self.in_feats_dst = in_feats + else: + self.in_feats_src, self.in_feats_dst = in_feats self.in_feats = in_feats self.out_feats = out_feats - self.in_feats_src, self.in_feats_dst = dgl.utils.expand_as_pair(in_feats) self.num_heads = num_heads self.feat_drop = nn.Dropout(feat_drop) self.concat = concat self.edge_feats = edge_feats self.negative_slope = negative_slope + self.residual = residual self.allow_zero_in_degree = allow_zero_in_degree if isinstance(in_feats, int): @@ -126,28 +131,34 @@ def __init__( if edge_feats is not None: self.lin_edge = nn.Linear(edge_feats, num_heads * out_feats, bias=False) - self.attn_weights = nn.Parameter(torch.Tensor(3 * num_heads * out_feats)) + self.attn_weights = nn.Parameter(torch.empty(3 * num_heads * out_feats)) else: self.register_parameter("lin_edge", None) - self.attn_weights = nn.Parameter(torch.Tensor(2 * num_heads * out_feats)) + self.attn_weights = nn.Parameter(torch.empty(2 * num_heads * out_feats)) - if bias and concat: - self.bias = nn.Parameter(torch.Tensor(num_heads, out_feats)) - elif bias and not concat: - self.bias = nn.Parameter(torch.Tensor(out_feats)) + out_dim = num_heads * out_feats if concat else out_feats + if residual: + if self.in_feats_dst != out_dim: + self.lin_res = nn.Linear(self.in_feats_dst, out_dim, bias=bias) + else: + self.lin_res = nn.Identity() else: - self.register_buffer("bias", None) + self.register_buffer("lin_res", None) - self.residual = residual and self.in_feats_dst != out_feats * num_heads - if self.residual: - self.lin_res = nn.Linear( - self.in_feats_dst, num_heads * out_feats, bias=bias - ) + if bias and not isinstance(self.lin_res, nn.Linear): + if concat: + self.bias = nn.Parameter(torch.empty(num_heads, out_feats)) + else: + self.bias = nn.Parameter(torch.empty(out_feats)) else: - self.register_buffer("lin_res", None) + self.register_buffer("bias", None) self.reset_parameters() + def set_allow_zero_in_degree(self, set_value): + r"""Set allow_zero_in_degree flag.""" + self.allow_zero_in_degree = set_value + def reset_parameters(self): r"""Reinitialize learnable parameters.""" gain = nn.init.calculate_gain("relu") @@ 
-172,7 +183,7 @@ def reset_parameters(self): def forward( self, g: Union[SparseGraph, dgl.DGLHeteroGraph], - nfeat: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], + nfeat: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], efeat: Optional[torch.Tensor] = None, max_in_degree: Optional[int] = None, ) -> torch.Tensor: @@ -182,8 +193,10 @@ def forward( ---------- graph : DGLGraph or SparseGraph The graph. - nfeat : torch.Tensor - Input features of shape :math:`(N, D_{in})`. + nfeat : torch.Tensor or (torch.Tensor, torch.Tensor) + Node features. If given as a tuple, the two elements correspond to + the source and destination node features, respectively, in a + bipartite graph. efeat: torch.Tensor, optional Optional edge features. max_in_degree : int @@ -237,18 +250,17 @@ def forward( if bipartite: if not hasattr(self, "lin_src"): - raise RuntimeError( - f"{self.__class__.__name__}.in_feats must be a pair of " - f"integers to allow bipartite node features, but got " - f"{self.in_feats}." - ) - nfeat_src = self.lin_src(nfeat[0]) - nfeat_dst = self.lin_dst(nfeat[1]) + nfeat_src = self.lin(nfeat[0]) + nfeat_dst = self.lin(nfeat[1]) + else: + nfeat_src = self.lin_src(nfeat[0]) + nfeat_dst = self.lin_dst(nfeat[1]) else: if not hasattr(self, "lin"): raise RuntimeError( f"{self.__class__.__name__}.in_feats is expected to be an " - f"integer, but got {self.in_feats}." + f"integer when the graph is not bipartite, " + f"but got {self.in_feats}." ) nfeat = self.lin(nfeat) diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py index 209a5fe1a8d..6c78b4df0b8 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,7 +11,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional, Tuple, Union +from typing import Optional, Union from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph from cugraph.utilities.utils import import_optional @@ -29,14 +29,11 @@ class GATv2Conv(BaseConv): Parameters ---------- - in_feats : int, or pair of ints - Input feature size; i.e, the number of dimensions of :math:`h_i^{(l)}`. - If the layer is to be applied to a unidirectional bipartite graph, `in_feats` - specifies the input feature size on both the source and destination nodes. - If a scalar is given, the source and destination node feature size - would take the same value. + in_feats : int or (int, int) + Input feature size. A pair denotes feature sizes of source and + destination nodes. out_feats : int - Output feature size; i.e, the number of dimensions of :math:`h_i^{(l+1)}`. + Output feature size. num_heads : int Number of heads in Multi-Head Attention. feat_drop : float, optional @@ -58,17 +55,15 @@ class GATv2Conv(BaseConv): input graph. By setting ``True``, it will suppress the check and let the users handle it by themselves. Defaults: ``False``. bias : bool, optional - If set to :obj:`False`, the layer will not learn - an additive bias. (default: :obj:`True`) + If True, learns a bias term. Defaults: ``True``. 
share_weights : bool, optional - If set to :obj:`True`, the same matrix for :math:`W_{left}` and - :math:`W_{right}` in the above equations, will be applied to the source - and the target node of every edge. (default: :obj:`False`) + If ``True``, the same matrix will be applied to the source and the + destination node features. Defaults: ``False``. """ def __init__( self, - in_feats: Union[int, Tuple[int, int]], + in_feats: Union[int, tuple[int, int]], out_feats: int, num_heads: int, feat_drop: float = 0.0, @@ -81,16 +76,22 @@ def __init__( share_weights: bool = False, ): super().__init__() + + if isinstance(in_feats, int): + self.in_feats_src = self.in_feats_dst = in_feats + else: + self.in_feats_src, self.in_feats_dst = in_feats self.in_feats = in_feats self.out_feats = out_feats - self.in_feats_src, self.in_feats_dst = dgl.utils.expand_as_pair(in_feats) self.num_heads = num_heads self.feat_drop = nn.Dropout(feat_drop) self.concat = concat self.edge_feats = edge_feats self.negative_slope = negative_slope + self.residual = residual self.allow_zero_in_degree = allow_zero_in_degree self.share_weights = share_weights + self.bias = bias self.lin_src = nn.Linear(self.in_feats_src, num_heads * out_feats, bias=bias) if share_weights: @@ -106,30 +107,28 @@ def __init__( self.in_feats_dst, num_heads * out_feats, bias=bias ) - self.attn = nn.Parameter(torch.Tensor(num_heads * out_feats)) + self.attn_weights = nn.Parameter(torch.empty(num_heads * out_feats)) if edge_feats is not None: self.lin_edge = nn.Linear(edge_feats, num_heads * out_feats, bias=False) else: self.register_parameter("lin_edge", None) - if bias and concat: - self.bias = nn.Parameter(torch.Tensor(num_heads, out_feats)) - elif bias and not concat: - self.bias = nn.Parameter(torch.Tensor(out_feats)) - else: - self.register_buffer("bias", None) - - self.residual = residual and self.in_feats_dst != out_feats * num_heads - if self.residual: - self.lin_res = nn.Linear( - self.in_feats_dst, num_heads * out_feats, bias=bias - ) + out_dim = num_heads * out_feats if concat else out_feats + if residual: + if self.in_feats_dst != out_dim: + self.lin_res = nn.Linear(self.in_feats_dst, out_dim, bias=bias) + else: + self.lin_res = nn.Identity() else: self.register_buffer("lin_res", None) self.reset_parameters() + def set_allow_zero_in_degree(self, set_value): + r"""Set allow_zero_in_degree flag.""" + self.allow_zero_in_degree = set_value + def reset_parameters(self): r"""Reinitialize learnable parameters.""" gain = nn.init.calculate_gain("relu") @@ -137,7 +136,7 @@ def reset_parameters(self): nn.init.xavier_normal_(self.lin_dst.weight, gain=gain) nn.init.xavier_normal_( - self.attn.view(-1, self.num_heads, self.out_feats), gain=gain + self.attn_weights.view(-1, self.num_heads, self.out_feats), gain=gain ) if self.lin_edge is not None: self.lin_edge.reset_parameters() @@ -145,13 +144,10 @@ def reset_parameters(self): if self.lin_res is not None: self.lin_res.reset_parameters() - if self.bias is not None: - nn.init.zeros_(self.bias) - def forward( self, g: Union[SparseGraph, dgl.DGLHeteroGraph], - nfeat: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], + nfeat: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], efeat: Optional[torch.Tensor] = None, max_in_degree: Optional[int] = None, ) -> torch.Tensor: @@ -225,7 +221,7 @@ def forward( out = ops_torch.operators.mha_gat_v2_n2n( nfeat, - self.attn, + self.attn_weights, _graph, num_heads=self.num_heads, activation="LeakyReLU", @@ -243,7 +239,4 @@ def forward( res = res.mean(dim=1) out = 
out + res - if self.bias is not None: - out = out + self.bias - return out diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py index 54916674210..5c4b5dea441 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -100,16 +100,16 @@ def __init__( self.self_loop = self_loop if regularizer is None: self.W = nn.Parameter( - torch.Tensor(num_rels + dim_self_loop, in_feats, out_feats) + torch.empty(num_rels + dim_self_loop, in_feats, out_feats) ) self.coeff = None elif regularizer == "basis": if num_bases is None: raise ValueError('Missing "num_bases" for basis regularization.') self.W = nn.Parameter( - torch.Tensor(num_bases + dim_self_loop, in_feats, out_feats) + torch.empty(num_bases + dim_self_loop, in_feats, out_feats) ) - self.coeff = nn.Parameter(torch.Tensor(num_rels, num_bases)) + self.coeff = nn.Parameter(torch.empty(num_rels, num_bases)) self.num_bases = num_bases else: raise ValueError( @@ -119,7 +119,7 @@ def __init__( self.regularizer = regularizer if bias: - self.bias = nn.Parameter(torch.Tensor(out_feats)) + self.bias = nn.Parameter(torch.empty(out_feats)) else: self.register_parameter("bias", None) diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py index a3f946d7cb4..b6198903766 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,7 +11,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional, Tuple, Union +from typing import Optional, Union from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph from cugraph.utilities.utils import import_optional @@ -65,7 +65,7 @@ class SAGEConv(BaseConv): def __init__( self, - in_feats: Union[int, Tuple[int, int]], + in_feats: Union[int, tuple[int, int]], out_feats: int, aggregator_type: str = "mean", feat_drop: float = 0.0, @@ -111,7 +111,7 @@ def reset_parameters(self): def forward( self, g: Union[SparseGraph, dgl.DGLHeteroGraph], - feat: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], + feat: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], max_in_degree: Optional[int] = None, ) -> torch.Tensor: r"""Forward computation. diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py index 8481b9ee265..e77556fb76f 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -11,7 +11,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional, Tuple, Union +from typing import Optional, Union from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph from cugraph.utilities.utils import import_optional @@ -51,7 +51,7 @@ class TransformerConv(BaseConv): def __init__( self, - in_node_feats: Union[int, Tuple[int, int]], + in_node_feats: Union[int, tuple[int, int]], out_node_feats: int, num_heads: int, concat: bool = True, @@ -116,7 +116,7 @@ def reset_parameters(self): def forward( self, g: Union[SparseGraph, dgl.DGLHeteroGraph], - nfeat: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], + nfeat: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], efeat: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward computation. diff --git a/python/cugraph-dgl/tests/conftest.py b/python/cugraph-dgl/tests/conftest.py index a3863ed81fa..ee1183f5cd1 100644 --- a/python/cugraph-dgl/tests/conftest.py +++ b/python/cugraph-dgl/tests/conftest.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -13,6 +13,7 @@ import pytest +import dgl import torch from cugraph.testing.mg_utils import ( @@ -58,3 +59,10 @@ class SparseGraphData1: @pytest.fixture def sparse_graph_1(): return SparseGraphData1() + + +@pytest.fixture +def dgl_graph_1(): + src = torch.tensor([0, 1, 0, 2, 3, 0, 4, 0, 5, 0, 6, 7, 0, 8, 9]) + dst = torch.tensor([1, 9, 2, 9, 9, 4, 9, 5, 9, 6, 9, 9, 8, 9, 0]) + return dgl.graph((src, dst)) diff --git a/python/cugraph-dgl/tests/nn/__init__.py b/python/cugraph-dgl/tests/nn/__init__.py deleted file mode 100644 index a1dd01f33d4..00000000000 --- a/python/cugraph-dgl/tests/nn/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/python/cugraph-dgl/tests/nn/common.py b/python/cugraph-dgl/tests/nn/common.py deleted file mode 100644 index 34787d20c9a..00000000000 --- a/python/cugraph-dgl/tests/nn/common.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from cugraph.utilities.utils import import_optional - -th = import_optional("torch") -dgl = import_optional("dgl") - - -def create_graph1(): - u = th.tensor([0, 1, 0, 2, 3, 0, 4, 0, 5, 0, 6, 7, 0, 8, 9]) - v = th.tensor([1, 9, 2, 9, 9, 4, 9, 5, 9, 6, 9, 9, 8, 9, 0]) - g = dgl.graph((u, v)) - return g diff --git a/python/cugraph-dgl/tests/nn/test_gatconv.py b/python/cugraph-dgl/tests/nn/test_gatconv.py index ce145b2bc87..de27efc6329 100644 --- a/python/cugraph-dgl/tests/nn/test_gatconv.py +++ b/python/cugraph-dgl/tests/nn/test_gatconv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -15,7 +15,6 @@ from cugraph_dgl.nn.conv.base import SparseGraph from cugraph_dgl.nn import GATConv as CuGraphGATConv -from .common import create_graph1 dgl = pytest.importorskip("dgl", reason="DGL not available") torch = pytest.importorskip("torch", reason="PyTorch not available") @@ -23,37 +22,49 @@ ATOL = 1e-6 -@pytest.mark.parametrize("bipartite", [False, True]) -@pytest.mark.parametrize("idtype_int", [False, True]) +@pytest.mark.parametrize("mode", ["bipartite", "share_weights", "regular"]) +@pytest.mark.parametrize("idx_type", [torch.int32, torch.int64]) @pytest.mark.parametrize("max_in_degree", [None, 8]) @pytest.mark.parametrize("num_heads", [1, 2, 7]) @pytest.mark.parametrize("residual", [False, True]) @pytest.mark.parametrize("to_block", [False, True]) @pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) def test_gatconv_equality( - bipartite, idtype_int, max_in_degree, num_heads, residual, to_block, sparse_format + dgl_graph_1, + mode, + idx_type, + max_in_degree, + num_heads, + residual, + to_block, + sparse_format, ): from dgl.nn.pytorch import GATConv torch.manual_seed(12345) - g = create_graph1().to("cuda") + device = torch.device("cuda") + g = dgl_graph_1.to(device).astype(idx_type) - if idtype_int: - g = g.int() if to_block: g = dgl.to_block(g) size = (g.num_src_nodes(), g.num_dst_nodes()) - if bipartite: + if mode == "bipartite": in_feats = (10, 3) nfeat = ( - torch.rand(g.num_src_nodes(), in_feats[0]).cuda(), - torch.rand(g.num_dst_nodes(), in_feats[1]).cuda(), + torch.randn(size[0], in_feats[0]).to(device), + torch.randn(size[1], in_feats[1]).to(device), + ) + elif mode == "share_weights": + in_feats = 5 + nfeat = ( + torch.randn(size[0], in_feats).to(device), + torch.randn(size[1], in_feats).to(device), ) else: - in_feats = 10 - nfeat = torch.rand(g.num_src_nodes(), in_feats).cuda() + in_feats = 7 + nfeat = torch.randn(size[0], in_feats).to(device) out_feats = 2 if sparse_format == "coo": @@ -65,24 +76,24 @@ def test_gatconv_equality( sg = SparseGraph(size=size, src_ids=indices, cdst_ids=offsets, formats="csc") args = (in_feats, out_feats, num_heads) - kwargs = {"bias": False, "allow_zero_in_degree": True} + kwargs = {"bias": False, "allow_zero_in_degree": True, "residual": residual} - conv1 = GATConv(*args, **kwargs).cuda() - out1 = conv1(g, nfeat) + conv1 = GATConv(*args, **kwargs).to(device) + conv2 = CuGraphGATConv(*args, **kwargs).to(device) - conv2 = CuGraphGATConv(*args, **kwargs).cuda() dim = num_heads * out_feats with torch.no_grad(): - conv2.attn_weights.data[:dim] = conv1.attn_l.data.flatten() - conv2.attn_weights.data[dim:] = conv1.attn_r.data.flatten() - if bipartite: - conv2.lin_src.weight.data = 
conv1.fc_src.weight.data.detach().clone() - conv2.lin_dst.weight.data = conv1.fc_dst.weight.data.detach().clone() + conv2.attn_weights[:dim].copy_(conv1.attn_l.flatten()) + conv2.attn_weights[dim:].copy_(conv1.attn_r.flatten()) + if mode == "bipartite": + conv2.lin_src.weight.copy_(conv1.fc_src.weight) + conv2.lin_dst.weight.copy_(conv1.fc_dst.weight) else: - conv2.lin.weight.data = conv1.fc.weight.data.detach().clone() - if residual and conv2.residual: - conv2.lin_res.weight.data = conv1.fc_res.weight.data.detach().clone() + conv2.lin.weight.copy_(conv1.fc.weight) + if residual and conv1.has_linear_res: + conv2.lin_res.weight.copy_(conv1.res_fc.weight) + out1 = conv1(g, nfeat) if sparse_format is not None: out2 = conv2(sg, nfeat, max_in_degree=max_in_degree) else: @@ -90,12 +101,12 @@ def test_gatconv_equality( assert torch.allclose(out1, out2, atol=ATOL) - grad_out1 = torch.rand_like(out1) - grad_out2 = grad_out1.clone().detach() + grad_out1 = torch.randn_like(out1) + grad_out2 = grad_out1.detach().clone() out1.backward(grad_out1) out2.backward(grad_out2) - if bipartite: + if mode == "bipartite": assert torch.allclose( conv1.fc_src.weight.grad, conv2.lin_src.weight.grad, atol=ATOL ) @@ -105,25 +116,38 @@ def test_gatconv_equality( else: assert torch.allclose(conv1.fc.weight.grad, conv2.lin.weight.grad, atol=ATOL) + if residual and conv1.has_linear_res: + assert torch.allclose( + conv1.res_fc.weight.grad, conv2.lin_res.weight.grad, atol=ATOL + ) + assert torch.allclose( torch.cat((conv1.attn_l.grad, conv1.attn_r.grad), dim=0), conv2.attn_weights.grad.view(2, num_heads, out_feats), - atol=ATOL, + atol=1e-5, # Note: using a loosened tolerance here due to numerical error ) @pytest.mark.parametrize("bias", [False, True]) @pytest.mark.parametrize("bipartite", [False, True]) @pytest.mark.parametrize("concat", [False, True]) -@pytest.mark.parametrize("max_in_degree", [None, 8, 800]) +@pytest.mark.parametrize("max_in_degree", [None, 8]) @pytest.mark.parametrize("num_heads", [1, 2, 7]) @pytest.mark.parametrize("to_block", [False, True]) @pytest.mark.parametrize("use_edge_feats", [False, True]) def test_gatconv_edge_feats( - bias, bipartite, concat, max_in_degree, num_heads, to_block, use_edge_feats + dgl_graph_1, + bias, + bipartite, + concat, + max_in_degree, + num_heads, + to_block, + use_edge_feats, ): torch.manual_seed(12345) - g = create_graph1().to("cuda") + device = torch.device("cuda") + g = dgl_graph_1.to(device) if to_block: g = dgl.to_block(g) @@ -131,17 +155,17 @@ def test_gatconv_edge_feats( if bipartite: in_feats = (10, 3) nfeat = ( - torch.rand(g.num_src_nodes(), in_feats[0]).cuda(), - torch.rand(g.num_dst_nodes(), in_feats[1]).cuda(), + torch.rand(g.num_src_nodes(), in_feats[0]).to(device), + torch.rand(g.num_dst_nodes(), in_feats[1]).to(device), ) else: in_feats = 10 - nfeat = torch.rand(g.num_src_nodes(), in_feats).cuda() + nfeat = torch.rand(g.num_src_nodes(), in_feats).to(device) out_feats = 2 if use_edge_feats: edge_feats = 3 - efeat = torch.rand(g.num_edges(), edge_feats).cuda() + efeat = torch.rand(g.num_edges(), edge_feats).to(device) else: edge_feats = None efeat = None @@ -154,8 +178,8 @@ def test_gatconv_edge_feats( edge_feats=edge_feats, bias=bias, allow_zero_in_degree=True, - ).cuda() + ).to(device) out = conv(g, nfeat, efeat=efeat, max_in_degree=max_in_degree) - grad_out = torch.rand_like(out) + grad_out = torch.randn_like(out) out.backward(grad_out) diff --git a/python/cugraph-dgl/tests/nn/test_gatv2conv.py b/python/cugraph-dgl/tests/nn/test_gatv2conv.py index 
52003edacca..2d26b7fdc28 100644 --- a/python/cugraph-dgl/tests/nn/test_gatv2conv.py +++ b/python/cugraph-dgl/tests/nn/test_gatv2conv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -15,45 +15,56 @@ from cugraph_dgl.nn.conv.base import SparseGraph from cugraph_dgl.nn import GATv2Conv as CuGraphGATv2Conv -from .common import create_graph1 dgl = pytest.importorskip("dgl", reason="DGL not available") torch = pytest.importorskip("torch", reason="PyTorch not available") -ATOL = 1e-6 +ATOL = 1e-5 -@pytest.mark.parametrize("bipartite", [False, True]) -@pytest.mark.parametrize("idtype_int", [False, True]) +@pytest.mark.parametrize("mode", ["bipartite", "share_weights", "regular"]) +@pytest.mark.parametrize("idx_type", [torch.int32, torch.int64]) @pytest.mark.parametrize("max_in_degree", [None, 8]) @pytest.mark.parametrize("num_heads", [1, 2, 7]) @pytest.mark.parametrize("residual", [False, True]) @pytest.mark.parametrize("to_block", [False, True]) @pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) def test_gatv2conv_equality( - bipartite, idtype_int, max_in_degree, num_heads, residual, to_block, sparse_format + dgl_graph_1, + mode, + idx_type, + max_in_degree, + num_heads, + residual, + to_block, + sparse_format, ): from dgl.nn.pytorch import GATv2Conv torch.manual_seed(12345) - g = create_graph1().to("cuda") + device = torch.device("cuda") + g = dgl_graph_1.to(device).astype(idx_type) - if idtype_int: - g = g.int() if to_block: g = dgl.to_block(g) size = (g.num_src_nodes(), g.num_dst_nodes()) - if bipartite: + if mode == "bipartite": in_feats = (10, 3) nfeat = ( - torch.rand(g.num_src_nodes(), in_feats[0]).cuda(), - torch.rand(g.num_dst_nodes(), in_feats[1]).cuda(), + torch.randn(size[0], in_feats[0]).to(device), + torch.randn(size[1], in_feats[1]).to(device), + ) + elif mode == "share_weights": + in_feats = 5 + nfeat = ( + torch.randn(size[0], in_feats).to(device), + torch.randn(size[1], in_feats).to(device), ) else: - in_feats = 10 - nfeat = torch.rand(g.num_src_nodes(), in_feats).cuda() + in_feats = 7 + nfeat = torch.randn(size[0], in_feats).to(device) out_feats = 2 if sparse_format == "coo": @@ -65,19 +76,24 @@ def test_gatv2conv_equality( sg = SparseGraph(size=size, src_ids=indices, cdst_ids=offsets, formats="csc") args = (in_feats, out_feats, num_heads) - kwargs = {"bias": False, "allow_zero_in_degree": True} + kwargs = { + "bias": False, + "allow_zero_in_degree": True, + "residual": residual, + "share_weights": mode == "share_weights", + } - conv1 = GATv2Conv(*args, **kwargs).cuda() - out1 = conv1(g, nfeat) + conv1 = GATv2Conv(*args, **kwargs).to(device) + conv2 = CuGraphGATv2Conv(*args, **kwargs).to(device) - conv2 = CuGraphGATv2Conv(*args, **kwargs).cuda() with torch.no_grad(): - conv2.attn.data = conv1.attn.data.flatten() - conv2.lin_src.weight.data = conv1.fc_src.weight.data.detach().clone() - conv2.lin_dst.weight.data = conv1.fc_dst.weight.data.detach().clone() - if residual and conv2.residual: - conv2.lin_res.weight.data = conv1.fc_res.weight.data.detach().clone() + conv2.attn_weights.copy_(conv1.attn.flatten()) + conv2.lin_src.weight.copy_(conv1.fc_src.weight) + conv2.lin_dst.weight.copy_(conv1.fc_dst.weight) + if residual: + conv2.lin_res.weight.copy_(conv1.res_fc.weight) + out1 = conv1(g, nfeat) if sparse_format is not None: out2 = 
conv2(sg, nfeat, max_in_degree=max_in_degree) else: @@ -85,8 +101,8 @@ def test_gatv2conv_equality( assert torch.allclose(out1, out2, atol=ATOL) - grad_out1 = torch.rand_like(out1) - grad_out2 = grad_out1.clone().detach() + grad_out1 = torch.randn_like(out1) + grad_out2 = grad_out1.detach().clone() out1.backward(grad_out1) out2.backward(grad_out2) @@ -97,21 +113,38 @@ def test_gatv2conv_equality( conv1.fc_dst.weight.grad, conv2.lin_dst.weight.grad, atol=ATOL ) - assert torch.allclose(conv1.attn.grad, conv1.attn.grad, atol=ATOL) + if residual: + assert torch.allclose( + conv1.res_fc.weight.grad, conv2.lin_res.weight.grad, atol=ATOL + ) + + assert torch.allclose( + conv1.attn.grad, + conv2.attn_weights.grad.view(1, num_heads, out_feats), + atol=ATOL, + ) @pytest.mark.parametrize("bias", [False, True]) @pytest.mark.parametrize("bipartite", [False, True]) @pytest.mark.parametrize("concat", [False, True]) -@pytest.mark.parametrize("max_in_degree", [None, 8, 800]) +@pytest.mark.parametrize("max_in_degree", [None, 8]) @pytest.mark.parametrize("num_heads", [1, 2, 7]) @pytest.mark.parametrize("to_block", [False, True]) @pytest.mark.parametrize("use_edge_feats", [False, True]) def test_gatv2conv_edge_feats( - bias, bipartite, concat, max_in_degree, num_heads, to_block, use_edge_feats + dgl_graph_1, + bias, + bipartite, + concat, + max_in_degree, + num_heads, + to_block, + use_edge_feats, ): torch.manual_seed(12345) - g = create_graph1().to("cuda") + device = torch.device("cuda") + g = dgl_graph_1.to(device) if to_block: g = dgl.to_block(g) @@ -119,17 +152,17 @@ def test_gatv2conv_edge_feats( if bipartite: in_feats = (10, 3) nfeat = ( - torch.rand(g.num_src_nodes(), in_feats[0]).cuda(), - torch.rand(g.num_dst_nodes(), in_feats[1]).cuda(), + torch.rand(g.num_src_nodes(), in_feats[0]).to(device), + torch.rand(g.num_dst_nodes(), in_feats[1]).to(device), ) else: in_feats = 10 - nfeat = torch.rand(g.num_src_nodes(), in_feats).cuda() + nfeat = torch.rand(g.num_src_nodes(), in_feats).to(device) out_feats = 2 if use_edge_feats: edge_feats = 3 - efeat = torch.rand(g.num_edges(), edge_feats).cuda() + efeat = torch.rand(g.num_edges(), edge_feats).to(device) else: edge_feats = None efeat = None @@ -142,8 +175,8 @@ def test_gatv2conv_edge_feats( edge_feats=edge_feats, bias=bias, allow_zero_in_degree=True, - ).cuda() + ).to(device) out = conv(g, nfeat, efeat=efeat, max_in_degree=max_in_degree) - grad_out = torch.rand_like(out) + grad_out = torch.randn_like(out) out.backward(grad_out) diff --git a/python/cugraph-dgl/tests/nn/test_relgraphconv.py b/python/cugraph-dgl/tests/nn/test_relgraphconv.py index bdaa89e57f2..b5d3686c609 100644 --- a/python/cugraph-dgl/tests/nn/test_relgraphconv.py +++ b/python/cugraph-dgl/tests/nn/test_relgraphconv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -15,7 +15,6 @@ from cugraph_dgl.nn.conv.base import SparseGraph from cugraph_dgl.nn import RelGraphConv as CuGraphRelGraphConv -from .common import create_graph1 dgl = pytest.importorskip("dgl", reason="DGL not available") torch = pytest.importorskip("torch", reason="PyTorch not available") @@ -23,7 +22,7 @@ ATOL = 1e-6 -@pytest.mark.parametrize("idtype_int", [False, True]) +@pytest.mark.parametrize("idx_type", [torch.int32, torch.int64]) @pytest.mark.parametrize("max_in_degree", [None, 8]) @pytest.mark.parametrize("num_bases", [1, 2, 5]) @pytest.mark.parametrize("regularizer", [None, "basis"]) @@ -31,7 +30,8 @@ @pytest.mark.parametrize("to_block", [False, True]) @pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) def test_relgraphconv_equality( - idtype_int, + dgl_graph_1, + idx_type, max_in_degree, num_bases, regularizer, @@ -42,6 +42,12 @@ def test_relgraphconv_equality( from dgl.nn.pytorch import RelGraphConv torch.manual_seed(12345) + device = torch.device("cuda") + g = dgl_graph_1.to(device).astype(idx_type) + + if to_block: + g = dgl.to_block(g) + in_feat, out_feat, num_rels = 10, 2, 3 args = (in_feat, out_feat, num_rels) kwargs = { @@ -50,16 +56,10 @@ def test_relgraphconv_equality( "bias": False, "self_loop": self_loop, } - g = create_graph1().to("cuda") - g.edata[dgl.ETYPE] = torch.randint(num_rels, (g.num_edges(),)).cuda() - - if idtype_int: - g = g.int() - if to_block: - g = dgl.to_block(g) + g.edata[dgl.ETYPE] = torch.randint(num_rels, (g.num_edges(),)).to(device) size = (g.num_src_nodes(), g.num_dst_nodes()) - feat = torch.rand(g.num_src_nodes(), in_feat).cuda() + feat = torch.rand(g.num_src_nodes(), in_feat).to(device) if sparse_format == "coo": sg = SparseGraph( @@ -76,18 +76,18 @@ def test_relgraphconv_equality( size=size, src_ids=indices, cdst_ids=offsets, values=etypes, formats="csc" ) - conv1 = RelGraphConv(*args, **kwargs).cuda() - conv2 = CuGraphRelGraphConv(*args, **kwargs, apply_norm=False).cuda() + conv1 = RelGraphConv(*args, **kwargs).to(device) + conv2 = CuGraphRelGraphConv(*args, **kwargs, apply_norm=False).to(device) with torch.no_grad(): if self_loop: - conv2.W.data[:-1] = conv1.linear_r.W.data - conv2.W.data[-1] = conv1.loop_weight.data + conv2.W[:-1].copy_(conv1.linear_r.W) + conv2.W[-1].copy_(conv1.loop_weight) else: - conv2.W.data = conv1.linear_r.W.data.detach().clone() + conv2.W.copy_(conv1.linear_r.W) if regularizer is not None: - conv2.coeff.data = conv1.linear_r.coeff.data.detach().clone() + conv2.coeff.copy_(conv1.linear_r.coeff) out1 = conv1(g, feat, g.edata[dgl.ETYPE]) @@ -98,7 +98,7 @@ def test_relgraphconv_equality( assert torch.allclose(out1, out2, atol=ATOL) - grad_out = torch.rand_like(out1) + grad_out = torch.randn_like(out1) out1.backward(grad_out) out2.backward(grad_out) diff --git a/python/cugraph-dgl/tests/nn/test_sageconv.py b/python/cugraph-dgl/tests/nn/test_sageconv.py index b5d0a44b868..3f1c2b1b3fe 100644 --- a/python/cugraph-dgl/tests/nn/test_sageconv.py +++ b/python/cugraph-dgl/tests/nn/test_sageconv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -15,7 +15,6 @@ from cugraph_dgl.nn.conv.base import SparseGraph from cugraph_dgl.nn import SAGEConv as CuGraphSAGEConv -from .common import create_graph1 dgl = pytest.importorskip("dgl", reason="DGL not available") torch = pytest.importorskip("torch", reason="PyTorch not available") @@ -26,21 +25,19 @@ @pytest.mark.parametrize("aggr", ["mean", "pool"]) @pytest.mark.parametrize("bias", [False, True]) @pytest.mark.parametrize("bipartite", [False, True]) -@pytest.mark.parametrize("idtype_int", [False, True]) +@pytest.mark.parametrize("idx_type", [torch.int32, torch.int64]) @pytest.mark.parametrize("max_in_degree", [None, 8]) @pytest.mark.parametrize("to_block", [False, True]) @pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) def test_sageconv_equality( - aggr, bias, bipartite, idtype_int, max_in_degree, to_block, sparse_format + dgl_graph_1, aggr, bias, bipartite, idx_type, max_in_degree, to_block, sparse_format ): from dgl.nn.pytorch import SAGEConv torch.manual_seed(12345) - kwargs = {"aggregator_type": aggr, "bias": bias} - g = create_graph1().to("cuda") + device = torch.device("cuda") + g = dgl_graph_1.to(device).astype(idx_type) - if idtype_int: - g = g.int() if to_block: g = dgl.to_block(g) @@ -49,12 +46,12 @@ def test_sageconv_equality( if bipartite: in_feats = (5, 3) feat = ( - torch.rand(size[0], in_feats[0], requires_grad=True).cuda(), - torch.rand(size[1], in_feats[1], requires_grad=True).cuda(), + torch.rand(size[0], in_feats[0], requires_grad=True).to(device), + torch.rand(size[1], in_feats[1], requires_grad=True).to(device), ) else: in_feats = 5 - feat = torch.rand(size[0], in_feats).cuda() + feat = torch.rand(size[0], in_feats).to(device) out_feats = 2 if sparse_format == "coo": @@ -65,18 +62,19 @@ def test_sageconv_equality( offsets, indices, _ = g.adj_tensors("csc") sg = SparseGraph(size=size, src_ids=indices, cdst_ids=offsets, formats="csc") - conv1 = SAGEConv(in_feats, out_feats, **kwargs).cuda() - conv2 = CuGraphSAGEConv(in_feats, out_feats, **kwargs).cuda() + kwargs = {"aggregator_type": aggr, "bias": bias} + conv1 = SAGEConv(in_feats, out_feats, **kwargs).to(device) + conv2 = CuGraphSAGEConv(in_feats, out_feats, **kwargs).to(device) in_feats_src = conv2.in_feats_src with torch.no_grad(): - conv2.lin.weight.data[:, :in_feats_src] = conv1.fc_neigh.weight.data - conv2.lin.weight.data[:, in_feats_src:] = conv1.fc_self.weight.data + conv2.lin.weight[:, :in_feats_src].copy_(conv1.fc_neigh.weight) + conv2.lin.weight[:, in_feats_src:].copy_(conv1.fc_self.weight) if bias: - conv2.lin.bias.data[:] = conv1.fc_self.bias.data + conv2.lin.bias.copy_(conv1.fc_self.bias) if aggr == "pool": - conv2.pre_lin.weight.data[:] = conv1.fc_pool.weight.data - conv2.pre_lin.bias.data[:] = conv1.fc_pool.bias.data + conv2.pre_lin.weight.copy_(conv1.fc_pool.weight) + conv2.pre_lin.bias.copy_(conv1.fc_pool.bias) out1 = conv1(g, feat) if sparse_format is not None: @@ -85,7 +83,7 @@ def test_sageconv_equality( out2 = conv2(g, feat, max_in_degree=max_in_degree) assert torch.allclose(out1, out2, atol=ATOL) - grad_out = torch.rand_like(out1) + grad_out = torch.randn_like(out1) out1.backward(grad_out) out2.backward(grad_out) assert torch.allclose( diff --git a/python/cugraph-dgl/tests/nn/test_transformerconv.py b/python/cugraph-dgl/tests/nn/test_transformerconv.py index 5ac4fd7bea7..28d13dedec8 100644 --- a/python/cugraph-dgl/tests/nn/test_transformerconv.py +++ b/python/cugraph-dgl/tests/nn/test_transformerconv.py @@ -1,4 +1,4 @@ -# Copyright (c) 
2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -15,7 +15,6 @@ from cugraph_dgl.nn.conv.base import SparseGraph from cugraph_dgl.nn import TransformerConv -from .common import create_graph1 dgl = pytest.importorskip("dgl", reason="DGL not available") torch = pytest.importorskip("torch", reason="PyTorch not available") @@ -26,27 +25,25 @@ @pytest.mark.parametrize("beta", [False, True]) @pytest.mark.parametrize("bipartite_node_feats", [False, True]) @pytest.mark.parametrize("concat", [False, True]) -@pytest.mark.parametrize("idtype_int", [False, True]) -@pytest.mark.parametrize("num_heads", [1, 2, 3, 4]) +@pytest.mark.parametrize("idx_type", [torch.int32, torch.int64]) +@pytest.mark.parametrize("num_heads", [1, 3, 4]) @pytest.mark.parametrize("to_block", [False, True]) @pytest.mark.parametrize("use_edge_feats", [False, True]) @pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) def test_transformerconv( + dgl_graph_1, beta, bipartite_node_feats, concat, - idtype_int, + idx_type, num_heads, to_block, use_edge_feats, sparse_format, ): torch.manual_seed(12345) - device = "cuda" - g = create_graph1().to(device) - - if idtype_int: - g = g.int() + device = torch.device("cuda") + g = dgl_graph_1.to(device).astype(idx_type) if to_block: g = dgl.to_block(g) @@ -92,5 +89,5 @@ def test_transformerconv( else: out = conv(g, nfeat, efeat) - grad_out = torch.rand_like(out) + grad_out = torch.randn_like(out) out.backward(grad_out)
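
The most behavior-relevant part of the diff above is how `GATConv`/`GATv2Conv` now wire the residual projection and the bias: `lin_res` becomes a `nn.Linear` (with the bias folded in) only when the destination feature size differs from the output size, an `nn.Identity` otherwise, and a standalone `bias` parameter is kept only when no linear residual already carries one. The following is a minimal, self-contained sketch of that selection logic, not the library's actual class; the module name and the toy shapes are hypothetical, and only the branching mirrors the diff.

```python
# Minimal sketch (PyTorch only) of the residual/bias wiring introduced above.
# "ResidualWiringSketch" is a made-up name for illustration; it is not part of
# cugraph-dgl. Only the if/else structure mirrors the GATConv changes.
import torch
import torch.nn as nn


class ResidualWiringSketch(nn.Module):
    """Toy module showing how lin_res and bias are selected."""

    def __init__(self, in_feats_dst, out_feats, num_heads,
                 concat=True, residual=True, bias=True):
        super().__init__()
        out_dim = num_heads * out_feats if concat else out_feats

        if residual:
            # A projection is only needed when the destination feature size
            # differs from the output size; otherwise the skip connection is
            # an identity mapping.
            if in_feats_dst != out_dim:
                self.lin_res = nn.Linear(in_feats_dst, out_dim, bias=bias)
            else:
                self.lin_res = nn.Identity()
        else:
            self.register_buffer("lin_res", None)

        # A separate bias parameter is kept only when no Linear residual
        # already carries a bias term of its own.
        if bias and not isinstance(self.lin_res, nn.Linear):
            if concat:
                self.bias = nn.Parameter(torch.empty(num_heads, out_feats))
            else:
                self.bias = nn.Parameter(torch.empty(out_feats))
        else:
            self.register_buffer("bias", None)


# Example: in_feats_dst=8, out_feats=2, num_heads=4, concat=True gives
# out_dim == 8 == in_feats_dst, so lin_res is nn.Identity() and a separate
# (4, 2) bias parameter is created.
m = ResidualWiringSketch(in_feats_dst=8, out_feats=2, num_heads=4)
print(type(m.lin_res).__name__,
      None if m.bias is None else tuple(m.bias.shape))
```

This also explains why the `GATv2Conv` forward pass in the diff no longer adds `self.bias` explicitly at the end: when a linear residual is present, its own bias already serves that role.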