From 649a356a5c3b044061b607cde9f18ebf5316509d Mon Sep 17 00:00:00 2001 From: Ahmed Hussein <50450311+amahussein@users.noreply.github.com> Date: Tue, 13 Feb 2024 11:48:28 -0600 Subject: [PATCH] Qualification should mark WriteFiles as supported (#784) Signed-off-by: Ahmed Hussein (amahussein) Fixes #783 - WriteFiles exec is added in Spark3.4.0+ - In fact, the WriteFiles exec will fallback to CPU if the child operator is not supported on GPU. - For simplicity of implementation, the Q tool considers WriteFiles Exec as supported all the time. This should be fine because the child (actual write exec) is evaluated independently. --- .../operatorsScore-databricks-aws-t4.csv | 1 + .../operatorsScore-databricks-azure-t4.csv | 1 + .../operatorsScore-dataproc-gke-l4.csv | 1 + .../operatorsScore-dataproc-gke-t4.csv | 1 + .../resources/operatorsScore-dataproc-l4.csv | 1 + .../operatorsScore-dataproc-serverless-l4.csv | 1 + .../resources/operatorsScore-dataproc-t4.csv | 1 + .../main/resources/operatorsScore-emr-a10.csv | 1 + .../main/resources/operatorsScore-emr-t4.csv | 1 + .../resources/operatorsScore-onprem-a100.csv | 1 + .../tool/planparser/SQLPlanParser.scala | 2 + .../planparser/WriteFilesExecParser.scala | 56 +++++++++++++++++++ .../nvidia/spark/rapids/BaseTestSuite.scala | 7 ++- .../tool/planparser/SqlPlanParserSuite.scala | 44 +++++++++++++++ 14 files changed, 118 insertions(+), 1 deletion(-) create mode 100644 core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/WriteFilesExecParser.scala diff --git a/core/src/main/resources/operatorsScore-databricks-aws-t4.csv b/core/src/main/resources/operatorsScore-databricks-aws-t4.csv index afd6848ce..4ce7087e7 100644 --- a/core/src/main/resources/operatorsScore-databricks-aws-t4.csv +++ b/core/src/main/resources/operatorsScore-databricks-aws-t4.csv @@ -260,6 +260,7 @@ WindowExpression,2.45 WindowSpecDefinition,2.45 XxHash64,2.45 Year,2.45 +WriteFilesExec,2.45 AggregateInPandasExec,1.2 ArrowEvalPythonExec,1.2 
FlatMapGroupsInPandasExec,1.2 diff --git a/core/src/main/resources/operatorsScore-databricks-azure-t4.csv b/core/src/main/resources/operatorsScore-databricks-azure-t4.csv index 6b7396c0d..0c7c65f51 100644 --- a/core/src/main/resources/operatorsScore-databricks-azure-t4.csv +++ b/core/src/main/resources/operatorsScore-databricks-azure-t4.csv @@ -260,6 +260,7 @@ WindowExpression,2.73 WindowSpecDefinition,2.73 XxHash64,2.73 Year,2.73 +WriteFilesExec,2.73 AggregateInPandasExec,1.2 ArrowEvalPythonExec,1.2 FlatMapGroupsInPandasExec,1.2 diff --git a/core/src/main/resources/operatorsScore-dataproc-gke-l4.csv b/core/src/main/resources/operatorsScore-dataproc-gke-l4.csv index 204866e8a..dbbfec22f 100644 --- a/core/src/main/resources/operatorsScore-dataproc-gke-l4.csv +++ b/core/src/main/resources/operatorsScore-dataproc-gke-l4.csv @@ -253,6 +253,7 @@ WindowExpression,3.74 WindowSpecDefinition,3.74 XxHash64,3.74 Year,3.74 +WriteFilesExec,3.74 AggregateInPandasExec,1.2 ArrowEvalPythonExec,1.2 FlatMapGroupsInPandasExec,1.2 diff --git a/core/src/main/resources/operatorsScore-dataproc-gke-t4.csv b/core/src/main/resources/operatorsScore-dataproc-gke-t4.csv index bf6395056..cd4bdb55e 100644 --- a/core/src/main/resources/operatorsScore-dataproc-gke-t4.csv +++ b/core/src/main/resources/operatorsScore-dataproc-gke-t4.csv @@ -253,6 +253,7 @@ WindowExpression,3.65 WindowSpecDefinition,3.65 XxHash64,3.65 Year,3.65 +WriteFilesExec,3.65 AggregateInPandasExec,1.2 ArrowEvalPythonExec,1.2 FlatMapGroupsInPandasExec,1.2 diff --git a/core/src/main/resources/operatorsScore-dataproc-l4.csv b/core/src/main/resources/operatorsScore-dataproc-l4.csv index 251c67c9f..9c13792fe 100644 --- a/core/src/main/resources/operatorsScore-dataproc-l4.csv +++ b/core/src/main/resources/operatorsScore-dataproc-l4.csv @@ -260,6 +260,7 @@ WindowExpression,4.16 WindowSpecDefinition,4.16 XxHash64,4.16 Year,4.16 +WriteFilesExec,4.16 AggregateInPandasExec,1.2 ArrowEvalPythonExec,1.2 FlatMapGroupsInPandasExec,1.2 diff 
--git a/core/src/main/resources/operatorsScore-dataproc-serverless-l4.csv b/core/src/main/resources/operatorsScore-dataproc-serverless-l4.csv index eb22ba760..6874994eb 100644 --- a/core/src/main/resources/operatorsScore-dataproc-serverless-l4.csv +++ b/core/src/main/resources/operatorsScore-dataproc-serverless-l4.csv @@ -253,6 +253,7 @@ WindowExpression,4.25 WindowSpecDefinition,4.25 XxHash64,4.25 Year,4.25 +WriteFilesExec,4.25 AggregateInPandasExec,1.2 ArrowEvalPythonExec,1.2 FlatMapGroupsInPandasExec,1.2 diff --git a/core/src/main/resources/operatorsScore-dataproc-t4.csv b/core/src/main/resources/operatorsScore-dataproc-t4.csv index 789829e69..d0234e2fd 100644 --- a/core/src/main/resources/operatorsScore-dataproc-t4.csv +++ b/core/src/main/resources/operatorsScore-dataproc-t4.csv @@ -260,6 +260,7 @@ WindowExpression,4.88 WindowSpecDefinition,4.88 XxHash64,4.88 Year,4.88 +WriteFilesExec,4.88 AggregateInPandasExec,1.2 ArrowEvalPythonExec,1.2 FlatMapGroupsInPandasExec,1.2 diff --git a/core/src/main/resources/operatorsScore-emr-a10.csv b/core/src/main/resources/operatorsScore-emr-a10.csv index 600614084..9c48cc9a4 100644 --- a/core/src/main/resources/operatorsScore-emr-a10.csv +++ b/core/src/main/resources/operatorsScore-emr-a10.csv @@ -260,6 +260,7 @@ WindowExpression,2.59 WindowSpecDefinition,2.59 XxHash64,2.59 Year,2.59 +WriteFilesExec,2.59 AggregateInPandasExec,1.2 ArrowEvalPythonExec,1.2 FlatMapGroupsInPandasExec,1.2 diff --git a/core/src/main/resources/operatorsScore-emr-t4.csv b/core/src/main/resources/operatorsScore-emr-t4.csv index bf4c818ae..503b05994 100644 --- a/core/src/main/resources/operatorsScore-emr-t4.csv +++ b/core/src/main/resources/operatorsScore-emr-t4.csv @@ -260,6 +260,7 @@ WindowExpression,2.07 WindowSpecDefinition,2.07 XxHash64,2.07 Year,2.07 +WriteFilesExec,2.07 AggregateInPandasExec,1.2 ArrowEvalPythonExec,1.2 FlatMapGroupsInPandasExec,1.2 diff --git a/core/src/main/resources/operatorsScore-onprem-a100.csv 
b/core/src/main/resources/operatorsScore-onprem-a100.csv index d5638c96e..f4367f388 100644 --- a/core/src/main/resources/operatorsScore-onprem-a100.csv +++ b/core/src/main/resources/operatorsScore-onprem-a100.csv @@ -265,6 +265,7 @@ WindowExpression,4 WindowSpecDefinition,4 XxHash64,4 Year,4 +WriteFilesExec,4 KMeans-pyspark,8.86 KMeans-scala,1 PCA-pyspark,2.24 diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/SQLPlanParser.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/SQLPlanParser.scala index e8adb1ff9..683ed18c1 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/SQLPlanParser.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/SQLPlanParser.scala @@ -496,6 +496,8 @@ object SQLPlanParser extends Logging { WindowExecParser(node, checker, sqlID).parse case "WindowInPandas" => WindowInPandasExecParser(node, checker, sqlID).parse + case wfe if WriteFilesExecParser.accepts(wfe) => + WriteFilesExecParser(node, checker, sqlID).parse case _ => // Execs that are members of reuseExecs (i.e., ReusedExchange) should be marked as // supported but with shouldRemove flag set to True. diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/WriteFilesExecParser.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/WriteFilesExecParser.scala new file mode 100644 index 000000000..bcd2272e0 --- /dev/null +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/WriteFilesExecParser.scala @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.tool.planparser + +import com.nvidia.spark.rapids.tool.qualification.PluginTypeChecker + +import org.apache.spark.sql.execution.ui.SparkPlanGraphNode + +case class WriteFilesExecParser( + node: SparkPlanGraphNode, + checker: PluginTypeChecker, + sqlID: Long) extends ExecParser { + // WriteFiles was added in Spark3.4+. + // The purpose was to create that operator to contain information from V1 write operators. + // Basically, it is supported IFF its child is supported. The GPU plan will fall back to the CPU + // if the child is not supported. + // For Q tool, we will treat the WriteFilesExec as supported regardless of the child. + // Then the child is evaluated on its own. This results in the WriteFilesExec being incorrectly + // marked as supported, but the error should not be a big deal since the operator has no + // duration associated with it. 
+ override val fullExecName: String = WriteFilesExecParser.execName + "Exec" + + override def parse: ExecInfo = { + // the WriteFiles does not have duration + val duration = None + val speedupFactor = checker.getSpeedupFactor(fullExecName) + ExecInfo.createExecNoNode( + sqlID, + WriteFilesExecParser.execName, + "", + speedupFactor, + duration, + node.id, opType = OpTypes.WriteExec, true, None) + } +} + +object WriteFilesExecParser { + val execName = "WriteFiles" + def accepts(nodeName: String): Boolean = { + nodeName.contains(execName) + } +} diff --git a/core/src/test/scala/com/nvidia/spark/rapids/BaseTestSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/BaseTestSuite.scala index 6f18946f3..ef7192a97 100644 --- a/core/src/test/scala/com/nvidia/spark/rapids/BaseTestSuite.scala +++ b/core/src/test/scala/com/nvidia/spark/rapids/BaseTestSuite.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -73,6 +73,11 @@ class BaseTestSuite extends FunSuite with BeforeAndAfterEach with Logging { "Spark340+ does not support the expression") } + protected def execsSupportedSparkGTE340(): (Boolean, String) = { + (ToolUtils.isSpark340OrLater(), + "Spark340+ supports the Exec/Expression") + } + def runConditionalTest(testName: String, assumeCondition: () => (Boolean, String)) (fun: => Unit): Unit = { val (isAllowed, ignoreMessage) = assumeCondition() diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/planparser/SqlPlanParserSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/planparser/SqlPlanParserSuite.scala index dfba232f4..3a451973f 100644 --- a/core/src/test/scala/com/nvidia/spark/rapids/tool/planparser/SqlPlanParserSuite.scala +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/planparser/SqlPlanParserSuite.scala @@ -366,6 +366,50 @@ class SQLPlanParserSuite extends BaseTestSuite { } } + runConditionalTest("WriteFilesExec is marked as Supported", + execsSupportedSparkGTE340) { + val dataWriteCMD = DataWritingCommandExecParser.insertIntoHadoopCMD + TrampolineUtil.withTempDir { outputLoc => + TrampolineUtil.withTempDir { eventLogDir => + val (eventLog, _) = ToolTestUtils.generateEventLog(eventLogDir, dataWriteCMD) { spark => + import spark.implicits._ + val df = spark.sparkContext.makeRDD(1 to 10000, 6).toDF + val dfWithStrings = df.select(col("value").cast("string")) + dfWithStrings.write.text(s"$outputLoc/testtext") + df.write.parquet(s"$outputLoc/testparquet") + df.write.orc(s"$outputLoc/testorc") + df.write.json(s"$outputLoc/testjson") + df.write.csv(s"$outputLoc/testcsv") + df + } + val pluginTypeChecker = new PluginTypeChecker() + val app = createAppFromEventlog(eventLog) + assert(app.sqlPlans.size == 6) + val parsedPlans = app.sqlPlans.map { case (sqlID, plan) => + SQLPlanParser.parseSQLPlan(app.appId, plan, sqlID, "", pluginTypeChecker, app) + } + val allExecInfo = getAllExecsFromPlan(parsedPlans.toSeq) + val writeExecs = 
allExecInfo.filter(_.exec.contains(s"$dataWriteCMD")) + val text = writeExecs.filter(_.expr.contains("text")) + val json = writeExecs.filter(_.expr.contains("json")) + val orc = writeExecs.filter(_.expr.contains("orc")) + val parquet = writeExecs.filter(_.expr.contains("parquet")) + val csv = writeExecs.filter(_.expr.contains("csv")) + for (t <- Seq(json, csv, text)) { + assertSizeAndNotSupported(1, t) + } + for (t <- Seq(orc, parquet)) { + assertSizeAndSupported(1, t) + } + // For Spark.3.4.0+, the new operator WriteFilesExec is added. + // Q tool handles this operator as supported regardless of the type of the exec operation. + val writeFileExecs = allExecInfo.filter(_.exec.contains(WriteFilesExecParser.execName)) + // we have 5 write operations, so we should have 5 WriteFilesExec. + assertSizeAndSupported(5, writeFileExecs) + } + } + } + test("CreateDataSourceTableAsSelectCommand") { // using event log to not deal with enabling hive support val eventLog = s"$qualLogDir/createdatasourcetable_eventlog.zstd"