Skip to content

Commit

Permalink
Run micro-benchmark on GCP (#464)
Browse files Browse the repository at this point in the history
* change data path

Signed-off-by: YanxuanLiu <[email protected]>

* change data path

Signed-off-by: YanxuanLiu <[email protected]>

* save result to file

Signed-off-by: YanxuanLiu <[email protected]>

* change format of output

Signed-off-by: YanxuanLiu <[email protected]>

---------

Signed-off-by: YanxuanLiu <[email protected]>
  • Loading branch information
YanxuanLiu authored Nov 7, 2024
1 parent 8f67f60 commit 27ed958
Showing 1 changed file with 12 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,9 @@
" total_time += round(end - start, 2)\n",
" count = count + 1\n",
" print(\"Retry times : {}, \".format(count) + appName + \" microbenchmark takes {} seconds\".format(round(end - start, 2)))\n",
" print(appName + \" microbenchmark takes average {} seconds after {} retries\".format(round(total_time/retryTimes),retryTimes))"
" print(appName + \" microbenchmark takes average {} seconds after {} retries\".format(round(total_time/retryTimes),retryTimes))\n",
" with open('result.txt', 'a') as file:\n",
" file.write(\"{},{},{}\\n\".format(appName, round(total_time/retryTimes), retryTimes))"
]
},
{
Expand Down Expand Up @@ -111,12 +113,12 @@
"# Load dataframe and create tempView\n",
"# You need to update data path to your real path!\n",
"dataRoot = os.getenv(\"DATA_ROOT\", \"/data\")\n",
"spark.read.parquet(dataRoot + \"/tpcds/customer\").createOrReplaceTempView(\"customer\")\n",
"spark.read.parquet(dataRoot + \"/tpcds/store_sales\").createOrReplaceTempView(\"store_sales\")\n",
"spark.read.parquet(dataRoot + \"/tpcds/catalog_sales\").createOrReplaceTempView(\"catalog_sales\")\n",
"spark.read.parquet(dataRoot + \"/tpcds/web_sales\").createOrReplaceTempView(\"web_sales\")\n",
"spark.read.parquet(dataRoot + \"/tpcds/item\").createOrReplaceTempView(\"item\")\n",
"spark.read.parquet(dataRoot + \"/tpcds/date_dim\").createOrReplaceTempView(\"date_dim\")\n",
"spark.read.parquet(dataRoot + \"/customer.dat\").createOrReplaceTempView(\"customer\")\n",
"spark.read.parquet(dataRoot + \"/store_sales.dat\").createOrReplaceTempView(\"store_sales\")\n",
"spark.read.parquet(dataRoot + \"/catalog_sales.dat\").createOrReplaceTempView(\"catalog_sales\")\n",
"spark.read.parquet(dataRoot + \"/web_sales.dat\").createOrReplaceTempView(\"web_sales\")\n",
"spark.read.parquet(dataRoot + \"/item.dat\").createOrReplaceTempView(\"item\")\n",
"spark.read.parquet(dataRoot + \"/date_dim.dat\").createOrReplaceTempView(\"date_dim\")\n",
"print(\"-\"*50)"
]
},
Expand Down Expand Up @@ -497,7 +499,7 @@
],
"source": [
"start = time() \n",
"spark.read.parquet(dataRoot + \"/tpcds/customer\").limit(1000000).write.format(\"parquet\").mode(\"overwrite\").save(\"/data/tmp/customer1m\")\n",
"spark.read.parquet(dataRoot + \"/customer.dat\").limit(1000000).write.format(\"parquet\").mode(\"overwrite\").save(\"/data/tmp/customer1m\")\n",
"end = time()\n",
"# Parquet file scanning and writing will be about 3 times faster running on GPU\n",
"print(\"scanning and writing parquet cost : {} seconds\".format(round(end - start, 2)))\n",
Expand Down Expand Up @@ -557,8 +559,8 @@
"metadata": {},
"outputs": [],
"source": [
"spark.read.parquet(dataRoot + \"/tpcds/store_sales\").createOrReplaceTempView(\"store_sales\")\n",
"spark.read.parquet(dataRoot + \"/tpcds/store_returns\").createOrReplaceTempView(\"store_returns\")\n",
"spark.read.parquet(dataRoot + \"/store_sales.dat\").createOrReplaceTempView(\"store_sales\")\n",
"spark.read.parquet(dataRoot + \"/store_returns.dat\").createOrReplaceTempView(\"store_returns\")\n",
"\n",
"print(\"-\"*50)\n",
"query = '''\n",
Expand Down

0 comments on commit 27ed958

Please sign in to comment.