diff --git a/docs/source/notebooks/Notebook_0_Data_Prepare.ipynb b/docs/source/notebooks/Notebook_0_Data_Prepare.ipynb index 4f32e48d91..ecb7b35e80 100644 --- a/docs/source/notebooks/Notebook_0_Data_Prepare.ipynb +++ b/docs/source/notebooks/Notebook_0_Data_Prepare.ipynb @@ -43,7 +43,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 25, "id": "609b0869-b4eb-4467-8353-31e2acc92203", "metadata": {}, "outputs": [], @@ -62,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "id": "4e3e6c4e-9d6d-4a6d-9c51-cf856eef06fc", "metadata": {}, "outputs": [], @@ -86,7 +86,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 27, "id": "2b8c752c-b6e8-4562-9637-4c56a4b09875", "metadata": {}, "outputs": [], @@ -109,7 +109,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 28, "id": "fc5eb53c-50ed-4b51-8b94-57094ee4dc15", "metadata": {}, "outputs": [], @@ -127,7 +127,7 @@ "metadata": {}, "source": [ "## Data Exploration and Explanation\n", - "The above commands created two sets of ACM data, i.e., the raw ACM data tables, and ACM GraphStorm input graphs. Below we explore these datasets, and explain their format so that users can prepare their own graph data easily." + "The above commands created two sets of ACM data, i.e., the **raw ACM data tables**, and ACM **GraphStorm input graphs**. Below we explore these datasets, and explain their format so that users can prepare their own graph data easily." 
] }, { @@ -150,11 +150,11 @@ "output_type": "stream", "text": [ "total 24\n", - "drwxrwxr-x 4 ubuntu ubuntu 4096 Dec 19 21:27 .\n", - "drwxrwxr-x 6 ubuntu ubuntu 4096 Dec 19 21:29 ..\n", - "-rw-rw-r-- 1 ubuntu ubuntu 5306 Dec 19 21:27 config.json\n", - "drwxrwxr-x 2 ubuntu ubuntu 4096 Dec 19 21:27 edges\n", - "drwxrwxr-x 2 ubuntu ubuntu 4096 Dec 19 21:27 nodes\n" + "drwxrwxr-x 4 ubuntu ubuntu 4096 May 15 23:29 .\n", + "drwxrwxr-x 6 ubuntu ubuntu 4096 May 15 23:30 ..\n", + "-rw-rw-r-- 1 ubuntu ubuntu 5306 May 15 23:29 config.json\n", + "drwxrwxr-x 2 ubuntu ubuntu 4096 May 15 23:29 edges\n", + "drwxrwxr-x 2 ubuntu ubuntu 4096 May 15 23:29 nodes\n" ] } ], @@ -173,11 +173,11 @@ "output_type": "stream", "text": [ "total 38744\n", - "drwxrwxr-x 2 ubuntu ubuntu 4096 Dec 19 21:27 .\n", - "drwxrwxr-x 4 ubuntu ubuntu 4096 Dec 19 21:27 ..\n", - "-rw-rw-r-- 1 ubuntu ubuntu 18843566 Dec 19 21:27 author.parquet\n", - "-rw-rw-r-- 1 ubuntu ubuntu 20704514 Dec 19 21:27 paper.parquet\n", - "-rw-rw-r-- 1 ubuntu ubuntu 113462 Dec 19 21:27 subject.parquet\n" + "drwxrwxr-x 2 ubuntu ubuntu 4096 May 15 23:29 .\n", + "drwxrwxr-x 4 ubuntu ubuntu 4096 May 15 23:29 ..\n", + "-rw-rw-r-- 1 ubuntu ubuntu 18843828 May 15 23:29 author.parquet\n", + "-rw-rw-r-- 1 ubuntu ubuntu 20702289 May 15 23:29 paper.parquet\n", + "-rw-rw-r-- 1 ubuntu ubuntu 113414 May 15 23:29 subject.parquet\n" ] } ], @@ -196,14 +196,14 @@ "output_type": "stream", "text": [ "total 1016\n", - "drwxrwxr-x 2 ubuntu ubuntu 4096 Dec 19 21:27 .\n", - "drwxrwxr-x 4 ubuntu ubuntu 4096 Dec 19 21:27 ..\n", - "-rw-rw-r-- 1 ubuntu ubuntu 263138 Dec 19 21:27 author_writing_paper.parquet\n", - "-rw-rw-r-- 1 ubuntu ubuntu 156358 Dec 19 21:27 paper_cited_paper.parquet\n", - "-rw-rw-r-- 1 ubuntu ubuntu 162714 Dec 19 21:27 paper_citing_paper.parquet\n", - "-rw-rw-r-- 1 ubuntu ubuntu 87792 Dec 19 21:27 paper_is-about_subject.parquet\n", - "-rw-rw-r-- 1 ubuntu ubuntu 265948 Dec 19 21:27 paper_written-by_author.parquet\n", - "-rw-rw-r-- 1 ubuntu 
ubuntu 84005 Dec 19 21:27 subject_has_paper.parquet\n" + "drwxrwxr-x 2 ubuntu ubuntu 4096 May 15 23:29 .\n", + "drwxrwxr-x 4 ubuntu ubuntu 4096 May 15 23:29 ..\n", + "-rw-rw-r-- 1 ubuntu ubuntu 263138 May 15 23:29 author_writing_paper.parquet\n", + "-rw-rw-r-- 1 ubuntu ubuntu 156358 May 15 23:29 paper_cited_paper.parquet\n", + "-rw-rw-r-- 1 ubuntu ubuntu 162714 May 15 23:29 paper_citing_paper.parquet\n", + "-rw-rw-r-- 1 ubuntu ubuntu 87792 May 15 23:29 paper_is-about_subject.parquet\n", + "-rw-rw-r-- 1 ubuntu ubuntu 265948 May 15 23:29 paper_written-by_author.parquet\n", + "-rw-rw-r-- 1 ubuntu ubuntu 84005 May 15 23:29 subject_has_paper.parquet\n" ] } ], @@ -237,7 +237,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 29, "id": "ddfef233-7fff-41ab-84f7-30d96885c472", "metadata": { "scrolled": true @@ -323,32 +323,32 @@ " \n", " \n", " \n", - " 4011\n", - " p4011\n", - " 4\n", - " [0.012342263, -0.01471429, -0.012913096, 0.007...\n", - " 'User behavior driven ranking without editoria...\n", + " 6933\n", + " p6933\n", + " 3\n", + " [-0.006179405, 0.010796122, -0.018994818, -0.0...\n", + " 'User-oriented text segmentation evaluation me...\n", " \n", " \n", - " 11379\n", - " p11379\n", - " 12\n", - " [-0.012718345, 0.020719944, -0.010691697, 0.00...\n", - " 'Reducing truth-telling online mechanisms to o...\n", + " 743\n", + " p743\n", + " 4\n", + " [-0.016835907, -0.020954693, 0.009945098, -0.0...\n", + " 'Similarity-aware indexing for real-time entit...\n", " \n", " \n", - " 9401\n", - " p9401\n", - " 8\n", - " [-0.013923097, 0.017362924, -0.009770028, -0.0...\n", - " 'The lazy adversary conjecture fails We prove ...\n", + " 11497\n", + " p11497\n", + " 12\n", + " [0.009553924, 0.019706111, 0.013354154, -0.010...\n", + " 'Polynomial time algorithm for computing the t...\n", " \n", " \n", - " 4928\n", - " p4928\n", - " 1\n", - " [0.019353714, 0.0066366955, 0.0115322415, 0.01...\n", - " 'Privacy preserving schema and data matching ...\n", + " 
2588\n", + " p2588\n", + " 2\n", + " [0.0036002623, -0.007723761, -0.012699484, -0....\n", + " 'Microformats: a pragmatic path to the semanti...\n", " \n", " \n", "\n", @@ -356,16 +356,16 @@ ], "text/plain": [ " node_id label feat \\\n", - "4011 p4011 4 [0.012342263, -0.01471429, -0.012913096, 0.007... \n", - "11379 p11379 12 [-0.012718345, 0.020719944, -0.010691697, 0.00... \n", - "9401 p9401 8 [-0.013923097, 0.017362924, -0.009770028, -0.0... \n", - "4928 p4928 1 [0.019353714, 0.0066366955, 0.0115322415, 0.01... \n", + "6933 p6933 3 [-0.006179405, 0.010796122, -0.018994818, -0.0... \n", + "743 p743 4 [-0.016835907, -0.020954693, 0.009945098, -0.0... \n", + "11497 p11497 12 [0.009553924, 0.019706111, 0.013354154, -0.010... \n", + "2588 p2588 2 [0.0036002623, -0.007723761, -0.012699484, -0.... \n", "\n", " text \n", - "4011 'User behavior driven ranking without editoria... \n", - "11379 'Reducing truth-telling online mechanisms to o... \n", - "9401 'The lazy adversary conjecture fails We prove ... \n", - "4928 'Privacy preserving schema and data matching ... " + "6933 'User-oriented text segmentation evaluation me... \n", + "743 'Similarity-aware indexing for real-time entit... \n", + "11497 'Polynomial time algorithm for computing the t... \n", + "2588 'Microformats: a pragmatic path to the semanti... 
" ] }, "execution_count": 11, @@ -431,27 +431,27 @@ " \n", " \n", " \n", - " 28779\n", - " p11255\n", - " p12232\n", + " 1140\n", + " p241\n", + " p6987\n", " 1.0\n", " \n", " \n", - " 2791\n", - " p704\n", - " p6747\n", + " 17361\n", + " p6296\n", + " p6221\n", " 1.0\n", " \n", " \n", - " 429\n", - " p119\n", - " p16\n", + " 21762\n", + " p7578\n", + " p7328\n", " 1.0\n", " \n", " \n", - " 7301\n", - " p2354\n", - " p8747\n", + " 28630\n", + " p11144\n", + " p11145\n", " 1.0\n", " \n", " \n", @@ -460,10 +460,10 @@ ], "text/plain": [ " source_id dest_id label\n", - "28779 p11255 p12232 1.0\n", - "2791 p704 p6747 1.0\n", - "429 p119 p16 1.0\n", - "7301 p2354 p8747 1.0" + "1140 p241 p6987 1.0\n", + "17361 p6296 p6221 1.0\n", + "21762 p7578 p7328 1.0\n", + "28630 p11144 p11145 1.0" ] }, "execution_count": 12, @@ -488,14 +488,14 @@ "In the above cells, we created a 1-partition graph in the `acm_gs_1p` folder and a 3-partition graph in the `acm_gs_3p` folder. The contents of the two folders are nearly the same, including \n", "\n", "1. a GraphStorm partitioned configuration JSON file;\n", - "2. original node id space to GraphStorm node id space mapping files, created during graph processing;\n", + "2. a subfolder named after `raw_id_mappings` that store the original node id space to GraphStorm node id space mapping files, created during graph processing;\n", "3. GraphStorm node id space to shuffle node id space mapping, created during graph patitioning;\n", "4. label statitic files." 
] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 16, "id": "86ef095b-5997-46e4-a8a1-99f1db61de57", "metadata": {}, "outputs": [ @@ -503,28 +503,24 @@ "name": "stdout", "output_type": "stream", "text": [ - "total 1884\n", - "drwxrwxr-x 3 ubuntu ubuntu 4096 Dec 19 21:28 .\n", - "drwxrwxr-x 6 ubuntu ubuntu 4096 Dec 19 21:29 ..\n", - "-rw-rw-r-- 1 ubuntu ubuntu 1673 Dec 19 21:28 acm.json\n", - "-rw-rw-r-- 1 ubuntu ubuntu 213402 Dec 19 21:28 author_id_remap.parquet\n", - "-rw-rw-r-- 1 ubuntu ubuntu 191 Dec 19 21:28 edge_label_stats.json\n", - "-rw-rw-r-- 1 ubuntu ubuntu 1287802 Dec 19 21:28 edge_mapping.pt\n", - "-rw-rw-r-- 1 ubuntu ubuntu 515 Dec 19 21:28 node_label_stats.json\n", - "-rw-rw-r-- 1 ubuntu ubuntu 241655 Dec 19 21:28 node_mapping.pt\n", - "-rw-rw-r-- 1 ubuntu ubuntu 150409 Dec 19 21:28 paper_id_remap.parquet\n", - "drwxrwxr-x 2 ubuntu ubuntu 4096 Dec 19 21:28 part0\n", - "-rw-rw-r-- 1 ubuntu ubuntu 2934 Dec 19 21:28 subject_id_remap.parquet\n" + "total 1516\n", + "-rw-rw-r-- 1 ubuntu ubuntu 1673 May 15 23:30 acm.json\n", + "-rw-rw-r-- 1 ubuntu ubuntu 191 May 15 23:30 edge_label_stats.json\n", + "-rw-rw-r-- 1 ubuntu ubuntu 1287802 May 15 23:30 edge_mapping.pt\n", + "-rw-rw-r-- 1 ubuntu ubuntu 515 May 15 23:30 node_label_stats.json\n", + "-rw-rw-r-- 1 ubuntu ubuntu 241655 May 15 23:30 node_mapping.pt\n", + "drwxrwxr-x 2 ubuntu ubuntu 4096 May 15 23:30 part0\n", + "drwxrwxr-x 5 ubuntu ubuntu 4096 May 15 23:30 raw_id_mappings\n" ] } ], "source": [ - "!ls -al ./acm_gs_1p" + "!ls -l ./acm_gs_1p" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 17, "id": "d5a11133-75ef-4299-b167-81498c4f1dc4", "metadata": {}, "outputs": [ @@ -532,25 +528,21 @@ "name": "stdout", "output_type": "stream", "text": [ - "total 1892\n", - "drwxrwxr-x 5 ubuntu ubuntu 4096 Dec 19 21:29 .\n", - "drwxrwxr-x 6 ubuntu ubuntu 4096 Dec 19 21:29 ..\n", - "-rw-rw-r-- 1 ubuntu ubuntu 3319 Dec 19 21:29 acm.json\n", - "-rw-rw-r-- 1 ubuntu ubuntu 
213402 Dec 19 21:29 author_id_remap.parquet\n", - "-rw-rw-r-- 1 ubuntu ubuntu 191 Dec 19 21:29 edge_label_stats.json\n", - "-rw-rw-r-- 1 ubuntu ubuntu 1287802 Dec 19 21:29 edge_mapping.pt\n", - "-rw-rw-r-- 1 ubuntu ubuntu 515 Dec 19 21:29 node_label_stats.json\n", - "-rw-rw-r-- 1 ubuntu ubuntu 241655 Dec 19 21:29 node_mapping.pt\n", - "-rw-rw-r-- 1 ubuntu ubuntu 150409 Dec 19 21:29 paper_id_remap.parquet\n", - "drwxrwxr-x 2 ubuntu ubuntu 4096 Dec 19 21:29 part0\n", - "drwxrwxr-x 2 ubuntu ubuntu 4096 Dec 19 21:29 part1\n", - "drwxrwxr-x 2 ubuntu ubuntu 4096 Dec 19 21:29 part2\n", - "-rw-rw-r-- 1 ubuntu ubuntu 2934 Dec 19 21:29 subject_id_remap.parquet\n" + "total 1524\n", + "-rw-rw-r-- 1 ubuntu ubuntu 3325 May 15 23:30 acm.json\n", + "-rw-rw-r-- 1 ubuntu ubuntu 191 May 15 23:30 edge_label_stats.json\n", + "-rw-rw-r-- 1 ubuntu ubuntu 1287802 May 15 23:30 edge_mapping.pt\n", + "-rw-rw-r-- 1 ubuntu ubuntu 515 May 15 23:30 node_label_stats.json\n", + "-rw-rw-r-- 1 ubuntu ubuntu 241655 May 15 23:30 node_mapping.pt\n", + "drwxrwxr-x 2 ubuntu ubuntu 4096 May 15 23:30 part0\n", + "drwxrwxr-x 2 ubuntu ubuntu 4096 May 15 23:30 part1\n", + "drwxrwxr-x 2 ubuntu ubuntu 4096 May 15 23:30 part2\n", + "drwxrwxr-x 5 ubuntu ubuntu 4096 May 15 23:30 raw_id_mappings\n" ] } ], "source": [ - "!ls -al ./acm_gs_3p" + "!ls -l ./acm_gs_3p" ] }, { @@ -577,7 +569,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 30, "id": "5f63bcba-fe24-4890-b232-4f9208886f02", "metadata": { "scrolled": true @@ -592,13 +584,61 @@ "id": "c7c84137-0f41-4287-984f-a7358372fde6", "metadata": {}, "source": [ - "#### Raw Node ID Mapping Files `****_id_remap.parquet`\n", - "Because the original node ids could be any types, e.g., strings, integers, or even floats, during graph processing GraphStorm conducts an ID mapping, which map the original node ID space given by users into the interger type node ID space, starting from 0. 
This mapping information is stored in those `****_id_remap.parquet` files." + "#### Raw Node ID Mapping Files in the `raw_id_mappings` Folder\n", + "Because the original node ids could be of any type, e.g., strings, integers, or even floats, during graph processing GraphStorm conducts an ID mapping, which maps the original node ID space given by users into the integer type node ID space, starting from 0. This mapping information is stored in the `raw_id_mappings` folder that contains a set of subfolders named after each node type name." ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 18, + "id": "3e1f4f36-67d1-46f7-b6a8-cafb4b765fc4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 12\n", + "drwxrwxr-x 2 ubuntu ubuntu 4096 May 15 23:30 author\n", + "drwxrwxr-x 2 ubuntu ubuntu 4096 May 15 23:30 paper\n", + "drwxrwxr-x 2 ubuntu ubuntu 4096 May 15 23:30 subject\n" + ] + } + ], + "source": [ + "!ls -l ./acm_gs_3p/raw_id_mappings/" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "eeb3791d-66c4-40d7-94b3-20ad7dfac8ca", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 208\n", + "-rw-rw-r-- 1 ubuntu ubuntu 212064 May 15 23:30 part-00000.parquet\n" + ] + } + ], + "source": [ + "!ls -l ./acm_gs_3p/raw_id_mappings/author/" + ] + }, + { + "cell_type": "markdown", + "id": "597434ba-7b0e-41ce-a75e-9f61c91fce77", + "metadata": {}, + "source": [ + "In each subfolder, there will be a set of parquet files with names in the format `part-*****.parquet`. The number of these parquet files is determined by the number of nodes of each type. The greater the number of nodes, the more files there will be. Users can use any parquet file exploration tool to check their contents, as the code below does." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 23, "id": "bf47dee7-aabd-4d18-860b-218665a488a9", "metadata": {}, "outputs": [ @@ -636,24 +676,24 @@ " \n", " \n", " \n", - " 765\n", - " a765\n", - " 765\n", + " 7958\n", + " a7958\n", + " 7958\n", " \n", " \n", - " 8438\n", - " a8438\n", - " 8438\n", + " 7475\n", + " a7475\n", + " 7475\n", " \n", " \n", - " 14914\n", - " a14914\n", - " 14914\n", + " 13423\n", + " a13423\n", + " 13423\n", " \n", " \n", - " 5227\n", - " a5227\n", - " 5227\n", + " 9246\n", + " a9246\n", + " 9246\n", " \n", " \n", "\n", @@ -661,19 +701,19 @@ ], "text/plain": [ " orig new\n", - "765 a765 765\n", - "8438 a8438 8438\n", - "14914 a14914 14914\n", - "5227 a5227 5227" + "7958 a7958 7958\n", + "7475 a7475 7475\n", + "13423 a13423 13423\n", + "9246 a9246 9246" ] }, - "execution_count": 16, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "author_nid_mapping_df = pd.read_parquet('./acm_gs_3p/author_id_remap.parquet')\n", + "author_nid_mapping_df = pd.read_parquet('./acm_gs_3p/raw_id_mappings/author/part-00000.parquet')\n", "\n", "print(author_nid_mapping_df.shape)\n", "author_nid_mapping_df.sample(4)" @@ -684,7 +724,7 @@ "id": "249a6a09-489a-4e45-a101-5ba459ffa50c", "metadata": {}, "source": [ - "As shown above, the `author_id_remap.parquet` file has two columns. The `orig` column contains the original string type node IDs in the raw node table data, while the `new` column contains the new integer node IDs in the Graph Node ID space." + "As shown above, the `author/part-00000.parquet` file has two columns. The `orig` column contains the original string type node IDs in the raw node table data, while the `new` column contains the new integer node IDs in the Graph Node ID space." 
] }, { @@ -703,7 +743,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 24, "id": "96b2fa7e-cf62-44f0-8f14-714b472e1c17", "metadata": {}, "outputs": [ @@ -713,11 +753,11 @@ "text": [ "Node id mapping:\n", "Node mapping keys: ['author', 'paper', 'subject']\n", - "Node type 'author' first 10 mapping ids: tensor([16442, 7664, 7665, 7667, 16448, 7669, 7670, 16443, 7674, 16453])\n", + "Node type 'author' first 10 mapping ids: tensor([9908, 5644, 5645, 5646, 5647, 5648, 5649, 5650, 9270, 5643])\n", "\n", "Edge id mapping:\n", "Edge mapping keys: [('author', 'writing', 'paper'), ('paper', 'cited', 'paper'), ('paper', 'citing', 'paper'), ('paper', 'is-about', 'subject'), ('paper', 'written-by', 'author'), ('subject', 'has', 'paper')]\n", - "Edge type '('author', 'writing', 'paper')' first 10 mapping ids: tensor([ 8198, 15018, 3479, 253, 21728, 20622, 15980, 13148, 11788, 9858])\n", + "Edge type '('author', 'writing', 'paper')' first 10 mapping ids: tensor([ 1622, 16688, 22176, 35837, 22116, 22183, 22234, 3538, 9921, 1062])\n", "\n" ] } @@ -743,7 +783,9 @@ "id": "80ff4d2a", "metadata": {}, "source": [ - "The ID mapping logic in those tensors is that GraphStorm graph ID is stored in these tensors, and their position indexes are the new partitioned node IDs. For example, for \"author\" nodes, the GraphStorm graph ID `16442` has a new partitioned node ID `0` because the number `16642` is in the first position (index=`0`) of the mapping tensor." + "The ID mapping logic in those tensors is that GraphStorm graph ID is stored in these tensors, and their position indexes are the new partitioned node IDs. For example, for \"author\" nodes, the GraphStorm graph ID `9908` has a new partitioned node ID `0` because the number `9908` is in the first position (index=`0`) of the mapping tensor.\n", + "
\n", + "warning: The first author node ID might not be 9908 because the partition process is not deterministic. Users may see author node IDs different from the given example.
" ] }, {