diff --git a/examples/SQL+DF-Examples/tpcds/notebooks/TPCDS-SF10.ipynb b/examples/SQL+DF-Examples/tpcds/notebooks/TPCDS-SF10.ipynb index a081a889..6aa90e6d 100644 --- a/examples/SQL+DF-Examples/tpcds/notebooks/TPCDS-SF10.ipynb +++ b/examples/SQL+DF-Examples/tpcds/notebooks/TPCDS-SF10.ipynb @@ -11,7 +11,7 @@ "tags": [] }, "source": [ - "# TPC DS scale factor 10 - Apache Spark acceleration on GPU with RAPIDS Spark\n", + "# TPC-DS 10GiB - Apache Spark acceleration on GPU with RAPIDS Spark\n", "\n", "based on https://colab.research.google.com/github/LucaCanali/Miscellaneous/blob/master/Performance_Testing/TPCDS_PySpark/Labs_and_Notes/TPCDS_PySpark_getstarted.ipynb#scrollTo=6bab7772" ] @@ -30,13 +30,12 @@ "outputs": [], "source": [ "spark_version='3.5.0'\n", - "rapids_version='24.10.0'\n", - "scala_version='2.12'" + "rapids_version='24.10.0'" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": { "executionInfo": { "elapsed": 1630, @@ -77,7 +76,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": { "executionInfo": { "elapsed": 1052, @@ -96,32 +95,13 @@ "from importlib.resources import files\n", "from pyspark.sql import SparkSession\n", "from tpcds_pyspark import TPCDS\n", + "import glob\n", "import os\n", "import pandas as pd\n", + "import re\n", "import time" ] }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "executionInfo": { - "elapsed": 12, - "status": "ok", - "timestamp": 1729291037060, - "user": { - "displayName": "Gera Shegalov", - "userId": "07399839501144323282" - }, - "user_tz": 420 - }, - "id": "CWaokrdGfsyo" - }, - "outputs": [], - "source": [ - "tpcds_pyspark_files = files('tpcds_pyspark')" - ] - }, { "cell_type": "markdown", "metadata": { @@ -164,9 +144,48 @@ "# Init a SparkSession with RAPIDS Spark" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Detect Scala Version used in PySpark package" + ] + }, { "cell_type": "code", 
"execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "pyspark_files = files('pyspark')\n", + "spark_sql_jar_path, *_ = glob.glob(f\"{pyspark_files}/*/spark-sql_*jar\")\n", + "spark_sql_jar = os.path.basename(spark_sql_jar_path)\n", + "scala_version = re.search(r'^spark-sql_(\\d+\\.\\d+)-.*\\.jar$', spark_sql_jar).group(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Find spark-measure artifact" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "tpcds_pyspark_files = files('tpcds_pyspark')\n", + "spark_measure_jar_paths = glob.glob(f\"{tpcds_pyspark_files}/spark-measure_{scala_version}-*.jar\")\n", + "assert spark_measure_jar_paths, f\"No spark-measure artifact built for Pyspark's Scala version {scala_version}\"\n", + "spark_measure_jar_paths.sort(key=lambda p: [int(n) for n in re.findall(r'\\d+', os.path.basename(p))], reverse=True)  # numeric order, so 0.24 sorts above 0.9\n", + "spark_measure_jar_path, *_ = spark_measure_jar_paths" + ] + }, + { + "cell_type": "code", + "execution_count": 8, "metadata": { "executionInfo": { "elapsed": 39420, @@ -185,8 +204,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "24/10/22 20:24:36 WARN Utils: Your hostname, e780a48-lcedt resolves to a loopback address: 127.0.1.1; using 10.112.215.249 instead (on interface enp36s0f0)\n", - "24/10/22 20:24:36 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n" + "24/10/23 14:46:45 WARN Utils: Your hostname, e780a48-lcedt resolves to a loopback address: 127.0.1.1; using 10.112.215.249 instead (on interface enp36s0f0)\n", + "24/10/23 14:46:45 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n" ] }, { @@ -203,10 +222,10 @@ "Ivy Default Cache set to: /home/gshegalov/.ivy2/cache\n", "The jars for the packages stored in: /home/gshegalov/.ivy2/jars\n", "com.nvidia#rapids-4-spark_2.12 added as a dependency\n", - ":: resolving dependencies :: org.apache.spark#spark-submit-parent-df1f6219-409d-4ff8-8387-c5192908c474;1.0\n", + ":: resolving
dependencies :: org.apache.spark#spark-submit-parent-93f929a5-5993-4f3d-9421-bd096589e319;1.0\n", "\tconfs: [default]\n", "\tfound com.nvidia#rapids-4-spark_2.12;24.10.0 in central\n", - ":: resolution report :: resolve 73ms :: artifacts dl 2ms\n", + ":: resolution report :: resolve 69ms :: artifacts dl 3ms\n", "\t:: modules in use:\n", "\tcom.nvidia#rapids-4-spark_2.12;24.10.0 from central in [default]\n", "\t---------------------------------------------------------------------\n", @@ -215,15 +234,15 @@ "\t---------------------------------------------------------------------\n", "\t| default | 1 | 0 | 0 | 0 || 1 | 0 |\n", "\t---------------------------------------------------------------------\n", - ":: retrieving :: org.apache.spark#spark-submit-parent-df1f6219-409d-4ff8-8387-c5192908c474\n", + ":: retrieving :: org.apache.spark#spark-submit-parent-93f929a5-5993-4f3d-9421-bd096589e319\n", "\tconfs: [default]\n", "\t0 artifacts copied, 1 already retrieved (0kB/2ms)\n", - "24/10/22 20:24:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", + "24/10/23 14:46:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", "Setting default log level to \"WARN\".\n", "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", - "24/10/22 20:24:38 WARN RapidsPluginUtils: RAPIDS Accelerator 24.10.0 using cudf 24.10.0, private revision bd4e99e18e20234ee0c54f95f4b0bfce18a6255e\n", - "24/10/22 20:24:38 WARN RapidsPluginUtils: RAPIDS Accelerator is enabled, to disable GPU support set `spark.rapids.sql.enabled` to false.\n", - "24/10/22 20:24:38 WARN RapidsPluginUtils: spark.rapids.sql.explain is set to `NOT_ON_GPU`. 
Set it to 'NONE' to suppress the diagnostics logging about the query placement on the GPU.\n" + "24/10/23 14:46:47 WARN RapidsPluginUtils: RAPIDS Accelerator 24.10.0 using cudf 24.10.0, private revision bd4e99e18e20234ee0c54f95f4b0bfce18a6255e\n", + "24/10/23 14:46:47 WARN RapidsPluginUtils: RAPIDS Accelerator is enabled, to disable GPU support set `spark.rapids.sql.enabled` to false.\n", + "24/10/23 14:46:47 WARN RapidsPluginUtils: spark.rapids.sql.explain is set to `NOT_ON_GPU`. Set it to 'NONE' to suppress the diagnostics logging about the query placement on the GPU.\n" ] } ], @@ -233,7 +252,7 @@ " .appName('TPCDS PySpark RAPIDS=ON/OFF')\n", " .config('spark.driver.memory', '5g')\n", " .config('spark.plugins', 'com.nvidia.spark.SQLPlugin')\n", - " .config('spark.jars', tpcds_pyspark_files.joinpath(f\"spark-measure_{scala_version}-0.24.jar\"))\n", + " .config('spark.jars', spark_measure_jar_path)\n", " .config('spark.jars.packages', f\"com.nvidia:rapids-4-spark_{scala_version}:{rapids_version}\")\n", " .getOrCreate()\n", ")\n" @@ -245,12 +264,12 @@ "id": "_4sYje2NiNA7" }, "source": [ - "# Verify SQL Acceleration on GPU can be enabled by checking the query plan and the GPU RAM in the \"Resources\" tab" + "# Verify SQL Acceleration on GPU can be enabled by checking the query plan" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -315,39 +334,6 @@ "sum_df.explain()" ] }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "executionInfo": { - "elapsed": 5, - "status": "ok", - "timestamp": 1729289104337, - "user": { - "displayName": "Gera Shegalov", - "userId": "07399839501144323282" - }, - "user_tz": 420 - }, - "id": "t7kj9OQnPuWA" - }, - "outputs": [], - "source": [ - "# https://github.com/LucaCanali/Miscellaneous/tree/master/Performance_Testing/TPCDS_PySpark/tpcds_pyspark/Queries\n", - "\n", - "# queries = None to run all (takes much longer)\n", - 
"queries = None\n", - "queries = [\n", - " 'q14a',\n", - " 'q14b',\n", - " 'q23a',\n", - " 'q23b',\n", - " # 'q24a',\n", - " # 'q24b',\n", - " # 'q88',\n", - "]\n" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -357,7 +343,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -389,11 +375,25 @@ "name": "stderr", "output_type": "stream", "text": [ - "24/10/22 20:24:46 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.\n" + "24/10/23 14:47:27 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.\n" ] } ], "source": [ + "# https://github.com/LucaCanali/Miscellaneous/tree/master/Performance_Testing/TPCDS_PySpark/tpcds_pyspark/Queries\n", + "\n", + "# queries = None to run all (takes much longer)\n", + "queries = None\n", + "queries = [\n", + " 'q14a',\n", + " 'q14b',\n", + " 'q23a',\n", + " 'q23b',\n", + " # 'q24a',\n", + " # 'q24b',\n", + " # 'q88',\n", + "]\n", + "\n", "demo_start = time.time()\n", "tpcds = TPCDS(data_path='./tpcds_10', num_runs=1, queries_repeat_times=1, queries=queries)" ] @@ -409,7 +409,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -440,7 +440,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "24/10/22 20:24:47 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n" + "24/10/23 14:47:33 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. 
This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n" ] }, { @@ -488,7 +488,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -528,12 +528,12 @@ "output_type": "stream", "text": [ "Job finished\n", - "...Start Time = 2024-10-22 20:24:49\n", - "...Elapsed Time = 9.78 sec\n", - "...Executors Run Time = 160.46 sec\n", - "...Executors CPU Time = 89.63 sec\n", - "...Executors JVM GC Time = 34.83 sec\n", - "...Average Active Tasks = 16.4\n", + "...Start Time = 2024-10-23 14:47:40\n", + "...Elapsed Time = 9.31 sec\n", + "...Executors Run Time = 149.86 sec\n", + "...Executors CPU Time = 85.31 sec\n", + "...Executors JVM GC Time = 30.7 sec\n", + "...Average Active Tasks = 16.1\n", "\n", "Run 0 - query q14b - attempt 0 - starting...\n" ] @@ -550,11 +550,11 @@ "output_type": "stream", "text": [ "Job finished\n", - "...Start Time = 2024-10-22 20:25:03\n", - "...Elapsed Time = 5.61 sec\n", - "...Executors Run Time = 97.64 sec\n", - "...Executors CPU Time = 58.69 sec\n", - "...Executors JVM GC Time = 25.94 sec\n", + "...Start Time = 2024-10-23 14:47:53\n", + "...Elapsed Time = 5.27 sec\n", + "...Executors Run Time = 91.68 sec\n", + "...Executors CPU Time = 55.95 sec\n", + "...Executors JVM GC Time = 24.74 sec\n", "...Average Active Tasks = 17.4\n", "\n", "Run 0 - query q23a - attempt 0 - starting...\n" @@ -572,11 +572,11 @@ "output_type": "stream", "text": [ "Job finished\n", - "...Start Time = 2024-10-22 20:25:10\n", - "...Elapsed Time = 8.77 sec\n", - "...Executors Run Time = 201.08 sec\n", - "...Executors CPU Time = 142.97 sec\n", - "...Executors JVM GC Time = 40.97 sec\n", + "...Start Time = 2024-10-23 14:48:00\n", + "...Elapsed Time = 8.73 sec\n", + "...Executors Run Time = 199.52 sec\n", + "...Executors CPU Time = 138.3 sec\n", + "...Executors JVM GC Time = 42.51 sec\n", "...Average Active Tasks = 22.9\n", "\n", "Run 0 - query q23b - attempt 0 - 
starting...\n" @@ -586,8 +586,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "24/10/22 20:25:24 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n", - "[Stage 218:=> (2 + 64) / 66]\r" + "[Stage 218:> (0 + 64) / 66]\r" ] }, { @@ -595,22 +594,21 @@ "output_type": "stream", "text": [ "Job finished\n", - "...Start Time = 2024-10-22 20:25:20\n", - "...Elapsed Time = 10.68 sec\n", - "...Executors Run Time = 241.66 sec\n", - "...Executors CPU Time = 157.55 sec\n", - "...Executors JVM GC Time = 60.73 sec\n", - "...Average Active Tasks = 22.6\n", - "CPU times: user 103 ms, sys: 37.3 ms, total: 141 ms\n", - "Wall time: 42.8 s\n" + "...Start Time = 2024-10-23 14:48:10\n", + "...Elapsed Time = 8.83 sec\n", + "...Executors Run Time = 224.27 sec\n", + "...Executors CPU Time = 145.62 sec\n", + "...Executors JVM GC Time = 57.8 sec\n", + "...Average Active Tasks = 25.4\n", + "CPU times: user 85.1 ms, sys: 25.7 ms, total: 111 ms\n", + "Wall time: 40.3 s\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "/home/gshegalov/gits/NVIDIA/spark-rapids-examples/.venv/lib/python3.10/site-packages/tpcds_pyspark/tpcds.py:243: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", - " results_pdf['timestamp'] = pd.to_datetime(results_pdf['timestamp'])\n" + " \r" ] }, { @@ -663,13 +661,13 @@ " q14a\n", " 30\n", " 838\n", - " 9779\n", - " 19488\n", - " 160456\n", - " 89631\n", - " 9277\n", - " 3040\n", - " 362\n", + " 9308\n", + " 18042\n", + " 149861\n", + " 85308\n", + " 7838\n", + " 2915\n", + " 271\n", " ...\n", " 551\n", " 0\n", @@ -687,13 +685,13 @@ " q14b\n", " 24\n", " 636\n", - " 5608\n", - " 8704\n", - " 97644\n", - " 58687\n", - " 2565\n", - " 1649\n", - " 210\n", + " 5272\n", + " 8360\n", + " 91678\n", + " 55951\n", + " 2927\n", + " 1578\n", + " 218\n", " ...\n", " 513\n", " 0\n", @@ -711,13 +709,13 @@ " q23a\n", " 18\n", " 621\n", - " 8765\n", - " 15382\n", - " 201084\n", - " 142969\n", - " 3432\n", - " 1429\n", - " 134\n", + " 8730\n", + " 15069\n", + " 199518\n", + " 138301\n", + " 2173\n", + " 1338\n", + " 107\n", " ...\n", " 2269\n", " 0\n", @@ -735,24 +733,24 @@ " q23b\n", " 21\n", " 690\n", - " 10684\n", - " 19596\n", - " 241665\n", - " 157549\n", - " 3374\n", - " 1718\n", - " 192\n", + " 8831\n", + " 17842\n", + " 224270\n", + " 145622\n", + " 3162\n", + " 1751\n", + " 120\n", " ...\n", " 4779\n", " 0\n", - " 1194344589\n", - " 1194344589\n", + " 1200330085\n", + " 1200330085\n", " 0\n", " 0\n", - " 1091584844\n", + " 1097570340\n", " 42452502\n", - " 22\n", - " 10\n", + " 25\n", + " 8\n", " \n", " \n", "\n", @@ -761,45 +759,45 @@ ], "text/plain": [ " query numStages numTasks elapsedTime stageDuration executorRunTime \\\n", - "0 q14a 30 838 9779 19488 160456 \n", - "1 q14b 24 636 5608 8704 97644 \n", - "2 q23a 18 621 8765 15382 201084 \n", - "3 q23b 21 690 10684 19596 241665 \n", + "0 q14a 30 838 9308 18042 149861 \n", + "1 q14b 24 636 5272 8360 91678 \n", + "2 q23a 18 621 8730 15069 199518 \n", + "3 q23b 21 690 8831 17842 224270 \n", "\n", " executorCpuTime executorDeserializeTime executorDeserializeCpuTime \\\n", - "0 89631 9277 3040 \n", - "1 
58687 2565 1649 \n", - "2 142969 3432 1429 \n", - "3 157549 3374 1718 \n", + "0 85308 7838 2915 \n", + "1 55951 2927 1578 \n", + "2 138301 2173 1338 \n", + "3 145622 3162 1751 \n", "\n", " resultSerializationTime ... shuffleLocalBlocksFetched \\\n", - "0 362 ... 551 \n", - "1 210 ... 513 \n", - "2 134 ... 2269 \n", - "3 192 ... 4779 \n", + "0 271 ... 551 \n", + "1 218 ... 513 \n", + "2 107 ... 2269 \n", + "3 120 ... 4779 \n", "\n", " shuffleRemoteBlocksFetched shuffleTotalBytesRead shuffleLocalBytesRead \\\n", "0 0 878437913 878437913 \n", "1 0 1013592969 1013592969 \n", "2 0 1115089630 1115089630 \n", - "3 0 1194344589 1194344589 \n", + "3 0 1200330085 1200330085 \n", "\n", " shuffleRemoteBytesRead shuffleRemoteBytesReadToDisk shuffleBytesWritten \\\n", "0 0 0 875325021 \n", "1 0 0 529675847 \n", "2 0 0 1085198863 \n", - "3 0 0 1091584844 \n", + "3 0 0 1097570340 \n", "\n", " shuffleRecordsWritten avg_active_tasks elapsed_time_seconds \n", "0 62516924 16 9 \n", "1 40865273 17 5 \n", "2 41990073 22 8 \n", - "3 42452502 22 10 \n", + "3 42452502 25 8 \n", "\n", "[4 rows x 33 columns]" ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -822,7 +820,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -854,31 +852,32 @@ "name": "stderr", "output_type": "stream", "text": [ - "24/10/22 20:25:34 WARN GpuOverrides: \n", + "24/10/23 14:48:22 WARN GpuOverrides: \n", "! 
cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.datasources.v2.OverwriteByExpressionExec\n", "\n", - "24/10/22 20:25:35 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", - "24/10/22 20:25:35 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", - "24/10/22 20:25:35 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", - "24/10/22 20:25:35 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", - "24/10/22 20:25:35 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", - "24/10/22 20:25:35 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", - "24/10/22 20:25:35 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", - "24/10/22 20:25:35 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", - "24/10/22 20:25:35 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", - "24/10/22 20:25:35 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", - 
"24/10/22 20:25:35 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", - "24/10/22 20:25:35 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", - "24/10/22 20:25:35 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", - "24/10/22 20:25:35 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", - "24/10/22 20:25:35 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", - "24/10/22 20:25:35 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", - "24/10/22 20:25:35 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", - "24/10/22 20:25:35 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", - "24/10/22 20:25:35 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", - "24/10/22 20:25:35 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", - "24/10/22 20:25:35 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of 
spark.rapids.sql.multiThreadedRead.numThreads = 20\n", - " \r" + "24/10/23 14:48:23 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", + "24/10/23 14:48:23 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", + "24/10/23 14:48:23 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", + "24/10/23 14:48:23 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", + "24/10/23 14:48:23 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", + "24/10/23 14:48:23 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", + "24/10/23 14:48:23 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", + "24/10/23 14:48:23 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", + "24/10/23 14:48:23 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", + "24/10/23 14:48:23 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", + "24/10/23 14:48:23 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool 
with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", + "24/10/23 14:48:23 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", + "24/10/23 14:48:23 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", + "24/10/23 14:48:23 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", + "24/10/23 14:48:23 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", + "24/10/23 14:48:23 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", + "24/10/23 14:48:23 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", + "24/10/23 14:48:23 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", + "24/10/23 14:48:23 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", + "24/10/23 14:48:23 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", + "24/10/23 14:48:23 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", + "24/10/23 14:48:23 WARN MultiFileReaderThreadPool: Configuring the 
file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n", + "24/10/23 14:48:23 WARN MultiFileReaderThreadPool: Configuring the file reader thread pool with a max of 64 threads instead of spark.rapids.sql.multiThreadedRead.numThreads = 20\n" ] }, { @@ -886,12 +885,12 @@ "output_type": "stream", "text": [ "Job finished\n", - "...Start Time = 2024-10-22 20:25:32\n", - "...Elapsed Time = 6.63 sec\n", - "...Executors Run Time = 134.13 sec\n", - "...Executors CPU Time = 20.09 sec\n", - "...Executors JVM GC Time = 6.65 sec\n", - "...Average Active Tasks = 20.2\n", + "...Start Time = 2024-10-23 14:48:20\n", + "...Elapsed Time = 6.05 sec\n", + "...Executors Run Time = 123.47 sec\n", + "...Executors CPU Time = 17.23 sec\n", + "...Executors JVM GC Time = 7.43 sec\n", + "...Average Active Tasks = 20.4\n", "\n", "Run 0 - query q14b - attempt 0 - starting...\n" ] @@ -900,7 +899,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "24/10/22 20:25:42 WARN GpuOverrides: \n", + "24/10/23 14:48:30 WARN GpuOverrides: \n", "! 
cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.datasources.v2.OverwriteByExpressionExec\n", "\n" ] @@ -910,12 +909,12 @@ "output_type": "stream", "text": [ "Job finished\n", - "...Start Time = 2024-10-22 20:25:42\n", - "...Elapsed Time = 2.96 sec\n", - "...Executors Run Time = 90.89 sec\n", - "...Executors CPU Time = 13.46 sec\n", - "...Executors JVM GC Time = 9.7 sec\n", - "...Average Active Tasks = 30.7\n", + "...Start Time = 2024-10-23 14:48:30\n", + "...Elapsed Time = 2.85 sec\n", + "...Executors Run Time = 86.64 sec\n", + "...Executors CPU Time = 12.98 sec\n", + "...Executors JVM GC Time = 7.06 sec\n", + "...Average Active Tasks = 30.4\n", "\n", "Run 0 - query q23a - attempt 0 - starting...\n" ] @@ -924,7 +923,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "24/10/22 20:25:47 WARN GpuOverrides: \n", + "24/10/23 14:48:34 WARN GpuOverrides: \n", "! cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.datasources.v2.OverwriteByExpressionExec\n", "\n", " \r" @@ -935,12 +934,12 @@ "output_type": "stream", "text": [ "Job finished\n", - "...Start Time = 2024-10-22 20:25:46\n", - "...Elapsed Time = 3.33 sec\n", - "...Executors Run Time = 96.84 sec\n", - "...Executors CPU Time = 21.94 sec\n", - "...Executors JVM GC Time = 4.43 sec\n", - "...Average Active Tasks = 29.1\n", + "...Start Time = 2024-10-23 14:48:34\n", + "...Elapsed Time = 3.22 sec\n", + "...Executors Run Time = 98.1 sec\n", + "...Executors CPU Time = 22.08 sec\n", + "...Executors JVM GC Time = 4.58 sec\n", + "...Average Active Tasks = 30.5\n", "\n", "Run 0 - query q23b - attempt 0 - starting...\n" ] @@ -949,10 +948,10 @@ "name": "stderr", "output_type": "stream", "text": [ - "24/10/22 20:25:51 WARN GpuOverrides: \n", + "24/10/23 14:48:39 WARN GpuOverrides: \n", "! 
cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.datasources.v2.OverwriteByExpressionExec\n", "\n", - "[Stage 420:======================================> (36 + 14) / 50]\r" + "[Stage 420:===============================================> (43 + 7) / 50]\r" ] }, { @@ -960,14 +959,14 @@ "output_type": "stream", "text": [ "Job finished\n", - "...Start Time = 2024-10-22 20:25:51\n", - "...Elapsed Time = 5.0 sec\n", - "...Executors Run Time = 187.15 sec\n", - "...Executors CPU Time = 25.02 sec\n", - "...Executors JVM GC Time = 5.3 sec\n", - "...Average Active Tasks = 37.4\n", - "CPU times: user 60.8 ms, sys: 17.7 ms, total: 78.4 ms\n", - "Wall time: 25.4 s\n" + "...Start Time = 2024-10-23 14:48:38\n", + "...Elapsed Time = 4.79 sec\n", + "...Executors Run Time = 183.46 sec\n", + "...Executors CPU Time = 23.68 sec\n", + "...Executors JVM GC Time = 6.21 sec\n", + "...Average Active Tasks = 38.3\n", + "CPU times: user 45.8 ms, sys: 22.7 ms, total: 68.5 ms\n", + "Wall time: 24.3 s\n" ] }, { @@ -1027,21 +1026,21 @@ " q14a\n", " 30\n", " 862\n", - " 6627\n", - " 12982\n", - " 134135\n", - " 20095\n", - " 7859\n", - " 4073\n", - " 57\n", + " 6048\n", + " 10688\n", + " 123472\n", + " 17226\n", + " 7391\n", + " 3881\n", + " 36\n", " ...\n", " 718\n", " 0\n", - " 696329859\n", - " 696329859\n", + " 696421878\n", + " 696421878\n", " 0\n", " 0\n", - " 693773959\n", + " 693865654\n", " 18794\n", " 20\n", " 6\n", @@ -1051,21 +1050,21 @@ " q14b\n", " 24\n", " 661\n", - " 2959\n", - " 5783\n", - " 90892\n", - " 13457\n", - " 5387\n", - " 2913\n", - " 64\n", + " 2848\n", + " 4761\n", + " 86635\n", + " 12985\n", + " 3519\n", + " 2711\n", + " 86\n", " ...\n", " 695\n", " 0\n", - " 767417490\n", - " 767417490\n", + " 767451618\n", + " 767451618\n", " 0\n", " 0\n", - " 421580618\n", + " 421600742\n", " 15346\n", " 30\n", " 2\n", @@ -1075,23 +1074,23 @@ " q23a\n", " 18\n", " 589\n", - " 3332\n", - " 5160\n", - " 96842\n", - " 21942\n", - 
" 2211\n", - " 2011\n", - " 32\n", + " 3221\n", + " 5354\n", + " 98104\n", + " 22078\n", + " 2304\n", + " 2092\n", + " 79\n", " ...\n", " 1727\n", " 0\n", - " 897041986\n", - " 897041986\n", + " 897013287\n", + " 897013287\n", " 0\n", " 0\n", - " 878935367\n", + " 878908228\n", " 15223\n", - " 29\n", + " 30\n", " 3\n", " \n", " \n", @@ -1099,24 +1098,24 @@ " q23b\n", " 21\n", " 651\n", - " 5005\n", - " 8439\n", - " 187145\n", - " 25015\n", - " 2687\n", - " 2425\n", - " 43\n", + " 4794\n", + " 8100\n", + " 183457\n", + " 23683\n", + " 4522\n", + " 2527\n", + " 35\n", " ...\n", " 3774\n", " 0\n", - " 952892096\n", - " 952892096\n", + " 952869300\n", + " 952869300\n", " 0\n", " 0\n", - " 888404420\n", + " 888393294\n", " 16352\n", - " 37\n", - " 5\n", + " 38\n", + " 4\n", " \n", " \n", "\n", @@ -1125,45 +1124,45 @@ ], "text/plain": [ " query numStages numTasks elapsedTime stageDuration executorRunTime \\\n", - "0 q14a 30 862 6627 12982 134135 \n", - "1 q14b 24 661 2959 5783 90892 \n", - "2 q23a 18 589 3332 5160 96842 \n", - "3 q23b 21 651 5005 8439 187145 \n", + "0 q14a 30 862 6048 10688 123472 \n", + "1 q14b 24 661 2848 4761 86635 \n", + "2 q23a 18 589 3221 5354 98104 \n", + "3 q23b 21 651 4794 8100 183457 \n", "\n", " executorCpuTime executorDeserializeTime executorDeserializeCpuTime \\\n", - "0 20095 7859 4073 \n", - "1 13457 5387 2913 \n", - "2 21942 2211 2011 \n", - "3 25015 2687 2425 \n", + "0 17226 7391 3881 \n", + "1 12985 3519 2711 \n", + "2 22078 2304 2092 \n", + "3 23683 4522 2527 \n", "\n", " resultSerializationTime ... shuffleLocalBlocksFetched \\\n", - "0 57 ... 718 \n", - "1 64 ... 695 \n", - "2 32 ... 1727 \n", - "3 43 ... 3774 \n", + "0 36 ... 718 \n", + "1 86 ... 695 \n", + "2 79 ... 1727 \n", + "3 35 ... 
3774 \n", "\n", " shuffleRemoteBlocksFetched shuffleTotalBytesRead shuffleLocalBytesRead \\\n", - "0 0 696329859 696329859 \n", - "1 0 767417490 767417490 \n", - "2 0 897041986 897041986 \n", - "3 0 952892096 952892096 \n", + "0 0 696421878 696421878 \n", + "1 0 767451618 767451618 \n", + "2 0 897013287 897013287 \n", + "3 0 952869300 952869300 \n", "\n", " shuffleRemoteBytesRead shuffleRemoteBytesReadToDisk shuffleBytesWritten \\\n", - "0 0 0 693773959 \n", - "1 0 0 421580618 \n", - "2 0 0 878935367 \n", - "3 0 0 888404420 \n", + "0 0 0 693865654 \n", + "1 0 0 421600742 \n", + "2 0 0 878908228 \n", + "3 0 0 888393294 \n", "\n", " shuffleRecordsWritten avg_active_tasks elapsed_time_seconds \n", "0 18794 20 6 \n", "1 15346 30 2 \n", - "2 15223 29 3 \n", - "3 16352 37 5 \n", + "2 15223 30 3 \n", + "3 16352 38 4 \n", "\n", "[4 rows x 33 columns]" ] }, - "execution_count": 12, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -1186,7 +1185,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": { "executionInfo": { "elapsed": 5, @@ -1209,14 +1208,14 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU and GPU run took: demo_dur=70.06181907653809 seconds\n" + "CPU and GPU run took: demo_dur=112.37040138244629 seconds\n" ] } ], @@ -1227,7 +1226,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -1253,13 +1252,13 @@ "" ] }, - "execution_count": 15, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -1274,7 +1273,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -1300,13 +1299,13 @@ "" ] }, - "execution_count": 16, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -1330,7 +1329,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 23, "metadata": { "executionInfo": { "elapsed": 4, @@ -1353,7 +1352,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 24, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -1509,7 +1508,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 25, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -1538,22 +1537,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "[Stage 448:>(5 + 59) / 64][Stage 450:> (0 + 1) / 1][Stage 452:> (0 + 1) / 1]\r" + " \r" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 4.61 ms, sys: 1.07 ms, total: 5.69 ms\n", - "Wall time: 2.01 s\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" + "CPU times: user 2.45 ms, sys: 1.97 ms, total: 4.43 ms\n", + "Wall time: 1.66 s\n" ] }, { @@ -1562,7 +1554,7 @@ "[Row(h8_30_to_9=18440, h9_to_9_30=39156, h9_30_to_10=38666, h10_to_10_30=58037, h10_30_to_11=58313, h11_to_11_30=34436, h11_30_to_12=33322, h12_to_12_30=39142)]" ] }, - "execution_count": 19, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" }