[Feature] New ACM data and a notebook for data prepare (#691)

*Issue #, if available:* *Description of changes:* This PR includes two changes: 1. Modify the ACM data generation Python code to use a more realistic setting; 2. Add a new notebook that introduces how to prepare the ACM data for the other notebooks. By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice. --------- Co-authored-by: Ubuntu <[email protected]> Co-authored-by: Ubuntu <[email protected]>
awslabs · Jan 10, 2024 · b57fbe1 · b57fbe1
1 parent e82cb8e
commit b57fbe1
Show file tree

Hide file tree

Showing 8 changed files with 1,435 additions and 6 deletions.
diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -1,5 +1,7 @@
 sphinx==7.1.2
 sphinx-rtd-theme==1.3.0
+nbsphinx
+pandoc
 --extra-index-url https://download.pytorch.org/whl/cpu
 torch==1.13.1+cpu
 -f https://data.dgl.ai/wheels-internal/repo.html

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -35,6 +35,7 @@
     "sphinx.ext.autosummary",
     "sphinx.ext.coverage",
     "sphinx.ext.mathjax",
+    "nbsphinx",
 ]
 templates_path = ['_templates']
 exclude_patterns = []

diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -35,6 +35,14 @@ Welcome to the GraphStorm Documentation and Tutorials
    scale/distributed
    scale/sagemaker
 
+.. toctree::
+   :maxdepth: 1
+   :caption: Programming User Guide
+   :hidden:
+   :glob:
+
+   notebooks/Notebook_0_Data_Prepare
+
 .. toctree::
    :maxdepth: 1
    :caption: Advanced Topics

diff --git a/docs/source/notebooks/Notebook_0_Data_Prepare.ipynb b/docs/source/notebooks/Notebook_0_Data_Prepare.ipynb
diff --git a/docs/source/tutorials/own-data.rst b/docs/source/tutorials/own-data.rst
@@ -217,14 +217,18 @@ The above command reads in the JSON file, and matchs its contents with the node
 
     /tmp/acm_gs
     acm.json
+    author_id_remap.parquet
+    edge_label_stats.json
     edge_label_stats.json
     edge_mapping.pt
     node_label_stats.json
     node_mapping.pt
+    paper_id_remap.parquet
     |- part0
         edge_feat.dgl
         graph.dgl
         node_feat.dgl
+    subject_id_remap.parquet
 
 Because the above command specifies the ``--num-parts`` to be ``1``, there is only one partition created, which is saved in the ``part0`` folder. These files become the inputs of GraphStorm's launch scripts.
 

diff --git a/examples/acm_data.py b/examples/acm_data.py
@@ -67,6 +67,10 @@ def create_acm_raw_data(graph,
     This graph is based on the DGL graph created by the create_acm_dgl_graph() function. Because we
     only use three relationships in the original ACM data, the number of graph nodes could be less 
     than the papers, authors, and subjects in the original node lists.
+    
+    In addition, to demonstrate the use of string type node ids in the raw graph data, we prepend
+    the first letter of each node type name to the original numerical ids, i.e., "author" -> "a",
+    "paper" -> "p", and "subject" -> "s". For example, author node 0 becomes "a0".
 
     Parameters
     ----------
@@ -88,12 +92,19 @@ def create_acm_raw_data(graph,
     # generate node dataframe: we use the graph node ids and node name as node_type
     node_list = []
 
+    # extract the first letter of each node type name as the prefix
+    node_prefix_dict = {}
+    for ntype in graph.ntypes:
+        node_prefix_dict[ntype] = ntype[0]
+
     for ntype in graph.ntypes:
         node_dict = {}
         # generate the id column
         node_ids = graph.nodes(ntype)
-        # convert tensor to list of arrays for saving in parquet format
-        node_dict['node_id'] = convert_tensor_to_list_arrays(node_ids)
+        # pad a prefix before each node id
+        str_node_ids = np.array([f'{node_prefix_dict[ntype]}{i}' for i in node_ids.numpy()])
+
+        node_dict['node_id'] = str_node_ids
 
         # generate the feature columns and label column
         if graph.nodes[ntype].data:
@@ -124,9 +135,11 @@ def create_acm_raw_data(graph,
         edge_dict = {}
         # generate the ids columns for both source nodes and destination nodes
         src_ids, dst_ids = graph.edges(etype=(src_ntype, etype, dst_ntype))
-       # convert tensor to list of arrays for saving in parquet format
-        edge_dict['source_id'] = convert_tensor_to_list_arrays(src_ids)
-        edge_dict['dest_id'] = convert_tensor_to_list_arrays(dst_ids)
+        # pad a prefix before each node id
+        str_src_ids = np.array([f'{node_prefix_dict[src_ntype]}{i}' for i in src_ids.numpy()])
+        str_dst_ids = np.array([f'{node_prefix_dict[dst_ntype]}{i}' for i in dst_ids.numpy()])
+        edge_dict['source_id'] = str_src_ids
+        edge_dict['dest_id'] = str_dst_ids
 
         # generate feature columns and label col
         if graph.edges[(src_ntype, etype, dst_ntype)].data:
@@ -190,6 +203,7 @@ def create_acm_raw_data(graph,
                 label_dict['label_col'] = col
                 label_dict['task_type'] = 'classification'
                 label_dict['split_pct'] = [0.8, 0.1, 0.1]
+                label_dict['label_stats_type'] = 'frequency_cnt'
                 labels_list.append(label_dict)
             elif col == 'text':
                 feat_dict['feature_col'] = col
@@ -231,7 +245,7 @@ def create_acm_raw_data(graph,
             elif col == 'dest_id':
                 edge_dict['dest_id_col'] = col
             elif col == 'label':
-                label_dict['task_type'] = 'link_prediction'      # In ACM data, we do not have this
+                label_dict['task_type'] = 'link_prediction'     # In ACM data, we do not have this
                                                                 # edge task. Here is just for demo
                 label_dict['split_pct'] = [0.8, 0.1, 0.1]       # Same as the label_split field.
                                                                 # The split pct values are just for
@@ -252,6 +266,7 @@ def create_acm_raw_data(graph,
 
     # generate the configuration JSON file
     data_json = {}
+    data_json['version'] = 'gconstruct-v0.1'
     data_json['nodes'] = node_jsons
     data_json['edges'] = edge_jsons
 

diff --git a/tutorial/ACM_raw_parquet.png b/tutorial/ACM_raw_parquet.png
diff --git a/tutorial/ACM_w_text.png b/tutorial/ACM_w_text.png