From 0238691108ddff83289456d1ecde4a358552def0 Mon Sep 17 00:00:00 2001 From: YanxuanLiu Date: Tue, 5 Nov 2024 15:39:50 +0800 Subject: [PATCH 1/4] change data path Signed-off-by: YanxuanLiu --- .../notebooks/micro-benchmarks-gpu.ipynb | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb b/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb index a7f9780d..c91d6a3a 100644 --- a/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb +++ b/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb @@ -111,12 +111,12 @@ "# Load dataframe and create tempView\n", "# You need to update data path to your real path!\n", "dataRoot = os.getenv(\"DATA_ROOT\", \"/data\")\n", - "spark.read.parquet(dataRoot + \"/tpcds/customer\").createOrReplaceTempView(\"customer\")\n", - "spark.read.parquet(dataRoot + \"/tpcds/store_sales\").createOrReplaceTempView(\"store_sales\")\n", - "spark.read.parquet(dataRoot + \"/tpcds/catalog_sales\").createOrReplaceTempView(\"catalog_sales\")\n", - "spark.read.parquet(dataRoot + \"/tpcds/web_sales\").createOrReplaceTempView(\"web_sales\")\n", - "spark.read.parquet(dataRoot + \"/tpcds/item\").createOrReplaceTempView(\"item\")\n", - "spark.read.parquet(dataRoot + \"/tpcds/date_dim\").createOrReplaceTempView(\"date_dim\")\n", + "spark.read.parquet(dataRoot + \"/customer.dat\").createOrReplaceTempView(\"customer\")\n", + "spark.read.parquet(dataRoot + \"/store_sales.dat\").createOrReplaceTempView(\"store_sales\")\n", + "spark.read.parquet(dataRoot + \"/catalog_sales.dat\").createOrReplaceTempView(\"catalog_sales\")\n", + "spark.read.parquet(dataRoot + \"/web_sales.dat\").createOrReplaceTempView(\"web_sales\")\n", + "spark.read.parquet(dataRoot + \"/item.dat\").createOrReplaceTempView(\"item\")\n", + "spark.read.parquet(dataRoot + 
\"/date_dim.dat\").createOrReplaceTempView(\"date_dim\")\n", "print(\"-\"*50)" ] }, From cbb317303688a39507e6dc1066126c19ac9ec314 Mon Sep 17 00:00:00 2001 From: YanxuanLiu Date: Tue, 5 Nov 2024 21:14:05 +0800 Subject: [PATCH 2/4] change data path Signed-off-by: YanxuanLiu --- .../micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb b/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb index c91d6a3a..9d8a6857 100644 --- a/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb +++ b/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb @@ -497,7 +497,7 @@ ], "source": [ "start = time() \n", - "spark.read.parquet(dataRoot + \"/tpcds/customer\").limit(1000000).write.format(\"parquet\").mode(\"overwrite\").save(\"/data/tmp/customer1m\")\n", + "spark.read.parquet(dataRoot + \"/customer.dat\").limit(1000000).write.format(\"parquet\").mode(\"overwrite\").save(\"/data/tmp/customer1m\")\n", "end = time()\n", "# Parquet file scanning and writing will be about 3 times faster running on GPU\n", "print(\"scanning and writing parquet cost : {} seconds\".format(round(end - start, 2)))\n", @@ -557,8 +557,8 @@ "metadata": {}, "outputs": [], "source": [ - "spark.read.parquet(dataRoot + \"/tpcds/store_sales\").createOrReplaceTempView(\"store_sales\")\n", - "spark.read.parquet(dataRoot + \"/tpcds/store_returns\").createOrReplaceTempView(\"store_returns\")\n", + "spark.read.parquet(dataRoot + \"/store_sales.dat\").createOrReplaceTempView(\"store_sales\")\n", + "spark.read.parquet(dataRoot + \"/store_returns.dat\").createOrReplaceTempView(\"store_returns\")\n", "\n", "print(\"-\"*50)\n", "query = '''\n", From db50e4a2c911879cee1d66ad88685914d2f083bb Mon Sep 17 00:00:00 2001 From: YanxuanLiu Date: Wed, 6 Nov 2024 10:17:28 +0800 Subject: [PATCH 3/4] save result to 
file Signed-off-by: YanxuanLiu --- .../micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb b/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb index 9d8a6857..dadd159c 100644 --- a/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb +++ b/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb @@ -52,7 +52,10 @@ " total_time += round(end - start, 2)\n", " count = count + 1\n", " print(\"Retry times : {}, \".format(count) + appName + \" microbenchmark takes {} seconds\".format(round(end - start, 2)))\n", - " print(appName + \" microbenchmark takes average {} seconds after {} retries\".format(round(total_time/retryTimes),retryTimes))" + " res = appName + \" microbenchmark takes average {} seconds after {} retries\\n\".format(round(total_time/retryTimes),retryTimes)\n", + " print(res)\n", + " with open('result.txt', 'a') as file:\n", + " file.write(res)" ] }, { From f040041935684df9bd30fc85ba7ce6ecd634b3bf Mon Sep 17 00:00:00 2001 From: YanxuanLiu Date: Wed, 6 Nov 2024 10:26:10 +0800 Subject: [PATCH 4/4] change format of output Signed-off-by: YanxuanLiu --- .../micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb b/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb index dadd159c..93c71736 100644 --- a/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb +++ b/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb @@ -52,10 +52,9 @@ " total_time += round(end - start, 2)\n", " count = count + 1\n", " print(\"Retry times : {}, \".format(count) + appName + \" microbenchmark takes {} seconds\".format(round(end - start, 
2)))\n", - " res = appName + \" microbenchmark takes average {} seconds after {} retries\\n\".format(round(total_time/retryTimes),retryTimes)\n", - " print(res)\n", + " print(appName + \" microbenchmark takes average {} seconds after {} retries\".format(round(total_time/retryTimes),retryTimes))\n", " with open('result.txt', 'a') as file:\n", - " file.write(res)" + " file.write(\"{},{},{}\\n\".format(appName, round(total_time/retryTimes), retryTimes))" ] }, {