[FEA] Add stage/task level diagnostic output for GPU slowness in Profiler tool #1375

Merged
30 commits, merged Nov 22, 2024
Changes from 11 commits
Commits (30)
1265e47  initial implementation (cindyyuanjiang, Oct 2, 2024)
6aefee9  updated output schema based on offline discussion (cindyyuanjiang, Oct 4, 2024)
5a56d30  address feedback to merge two tables together (cindyyuanjiang, Oct 9, 2024)
be13fa8  update order of columns (cindyyuanjiang, Oct 12, 2024)
7b895c5  get gpu semaphore time (cindyyuanjiang, Oct 14, 2024)
68c34f6  add benchmark (cindyyuanjiang, Oct 21, 2024)
20dd4c8  clean up code (cindyyuanjiang, Oct 23, 2024)
41f6ff0  resolve merge conflict (cindyyuanjiang, Oct 23, 2024)
7cd25e9  add unit test (cindyyuanjiang, Oct 25, 2024)
f740192  new expectation file (cindyyuanjiang, Oct 25, 2024)
6931086  resolve merge conflict (cindyyuanjiang, Oct 25, 2024)
2dcdb9b  address review feedback (cindyyuanjiang, Oct 30, 2024)
ac162f0  address review feedback (cindyyuanjiang, Oct 30, 2024)
2d53682  add new file (cindyyuanjiang, Oct 30, 2024)
9d53668  remove unnecessary comment (cindyyuanjiang, Oct 30, 2024)
23946bc  address review feedback (cindyyuanjiang, Oct 31, 2024)
b583126  refactored for memory optimization (cindyyuanjiang, Nov 6, 2024)
058686d  addressed review feedback (cindyyuanjiang, Nov 6, 2024)
bdd2292  refactor stageDiagnosticResults (cindyyuanjiang, Nov 9, 2024)
3a8cf9e  change num attemps to tasks (cindyyuanjiang, Nov 12, 2024)
1361d53  remove diagnostic from applicationsummaryinfo (cindyyuanjiang, Nov 13, 2024)
c54a4b7  remove unused import (cindyyuanjiang, Nov 13, 2024)
882d403  new file (cindyyuanjiang, Nov 13, 2024)
f985b44  Merge branch 'dev' into profiler-diagnostic (cindyyuanjiang, Nov 14, 2024)
f3b78ff  add diagnostic view in qual tool output (cindyyuanjiang, Nov 15, 2024)
8b317e6  remove diagnostic vire from qual tool profile.log file (cindyyuanjiang, Nov 15, 2024)
ebfc6e3  address review feedback (cindyyuanjiang, Nov 15, 2024)
b88119a  Merge branch 'dev' into profiler-diagnostic (cindyyuanjiang, Nov 19, 2024)
056d4b2  add profile benchmark class (cindyyuanjiang, Nov 20, 2024)
de47ef4  fix profiler benchmark (cindyyuanjiang, Nov 22, 2024)
@@ -16,7 +16,7 @@

package com.nvidia.spark.rapids.tool.analysis

import com.nvidia.spark.rapids.tool.profiling.{IOAnalysisProfileResult, JobAggTaskMetricsProfileResult, ShuffleSkewProfileResult, SQLDurationExecutorTimeProfileResult, SQLMaxTaskInputSizes, SQLTaskAggMetricsProfileResult, StageAggTaskMetricsProfileResult}
import com.nvidia.spark.rapids.tool.profiling.{IOAnalysisProfileResult, JobAggTaskMetricsProfileResult, ShuffleSkewProfileResult, SQLDurationExecutorTimeProfileResult, SQLMaxTaskInputSizes, SQLTaskAggMetricsProfileResult, StageAggTaskMetricsProfileResult, StageDiagnosticMetricsProfileResult}

/**
* The result of the aggregation of the raw metrics. It contains the aggregated metrics for an
@@ -32,6 +32,7 @@ import com.nvidia.spark.rapids.tool.profiling.{IOAnalysisProfileResult, JobAggTa
* @param ioAggs lists the SQLs along their IO metrics
* @param sqlDurAggs the aggregated duration and CPU time for SQLs
* @param maxTaskInputSizes a sequence of SQLMaxTaskInputSizes that contains the maximum input size
* @param stageDiagnostics the stage level Spark metrics for diagnostic purposes
*/
case class AggRawMetricsResult(
jobAggs: Seq[JobAggTaskMetricsProfileResult],
@@ -40,4 +41,5 @@ case class AggRawMetricsResult(
sqlAggs: Seq[SQLTaskAggMetricsProfileResult],
ioAggs: Seq[IOAnalysisProfileResult],
sqlDurAggs: Seq[SQLDurationExecutorTimeProfileResult],
maxTaskInputSizes: Seq[SQLMaxTaskInputSizes])
maxTaskInputSizes: Seq[SQLMaxTaskInputSizes],
stageDiagnostics: Seq[StageDiagnosticMetricsProfileResult])
@@ -44,6 +44,7 @@ import org.apache.spark.sql.rapids.tool.util.ToolsPlanGraph
* @param app the Application info objects that contains the SQL plans to be processed
*/
class AppSQLPlanAnalyzer(app: AppBase, appIndex: Int) extends AppAnalysisBase(app) {
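  // Name of the accumulator used by the RAPIDS Accelerator to record time tasks spend waiting
  // on the GPU semaphore; matched by name below to populate stageToGpuSemaphoreWaitTime.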
val GPU_SEMAPHORE_WAIT_METRIC_NAME = "gpuSemaphoreWait"
// A map between (SQL ID, Node ID) and the set of stage IDs
// TODO: The Qualification should use this map instead of building a new set for each exec.
private val sqlPlanNodeIdToStageIds: HashMap[(Long, Long), Set[Int]] =
@@ -56,6 +57,10 @@ class AppSQLPlanAnalyzer(app: AppBase, appIndex: Int) extends AppAnalysisBase(ap
// SQLPlanParser.
var unsupportedSQLPlan: ArrayBuffer[UnsupportedSQLPlan] = ArrayBuffer[UnsupportedSQLPlan]()
var allSQLMetrics: ArrayBuffer[SQLMetricInfoCase] = ArrayBuffer[SQLMetricInfoCase]()
// A map between stage ID and the sequence of node names
val stageToNodeNames: HashMap[Long, Seq[String]] = HashMap.empty[Long, Seq[String]]
// A map between stage ID and total GPU semaphore wait time
val stageToGpuSemaphoreWaitTime: HashMap[Long, Long] = HashMap.empty[Long, Long]

/**
* Connects Operators to Stages using AccumulatorIDs.
@@ -261,6 +266,7 @@ class AppSQLPlanAnalyzer(app: AppBase, appIndex: Int) extends AppAnalysisBase(ap
}
validNodes.map(n => s"${n.name}(${n.id.toString})")
}.getOrElse(Seq.empty)
stageToNodeNames(sModel.stageInfo.stageId) = nodeNames
SQLStageInfoProfileResult(appIndex, j.sqlID.get, jobId, sModel.stageInfo.stageId,
sModel.stageInfo.attemptNumber(), sModel.duration, nodeNames)
}
@@ -339,6 +345,9 @@ class AppSQLPlanAnalyzer(app: AppBase, appIndex: Int) extends AppAnalysisBase(ap
} else {
taskUpatesSubset(taskUpatesSubset.size / 2)
}
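      // Record this stage's summed GPU semaphore wait time when the accumulator name matches.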
if (accumInfo.infoRef.getName.contains(GPU_SEMAPHORE_WAIT_METRIC_NAME)) {
stageToGpuSemaphoreWaitTime(stageId) = sum
}
Some(AccumProfileResults(
appIndex,
stageId,
@@ -29,7 +29,8 @@ trait AppSparkMetricsAggTrait extends AppIndexMapperTrait {
* @return a single record of AggRawMetricsResult containing all the raw aggregated Spark
* metrics
*/
def getAggRawMetrics(app: AppBase, index: Int): AggRawMetricsResult = {
def getAggRawMetrics(app: AppBase, index: Int, sqlAnalyzer: Option[AppSQLPlanAnalyzer] = None):
AggRawMetricsResult = {
val analysisObj = new AppSparkMetricsAnalyzer(app)
AggRawMetricsResult(
analysisObj.aggregateSparkMetricsByJob(index),
@@ -38,7 +39,8 @@ trait AppSparkMetricsAggTrait extends AppIndexMapperTrait {
analysisObj.aggregateSparkMetricsBySql(index),
analysisObj.aggregateIOMetricsBySql(analysisObj.aggregateSparkMetricsBySql(index)),
analysisObj.aggregateDurationAndCPUTimeBySql(index),
Seq(analysisObj.maxTaskInputSizeBytesPerSQL(index)))
Seq(analysisObj.maxTaskInputSizeBytesPerSQL(index)),
analysisObj.aggregateDiagnosticSparkMetricsByStage(index, sqlAnalyzer))
}

/**
@@ -59,7 +61,8 @@
agg1.sqlAggs ++ agg2.sqlAggs,
agg1.ioAggs ++ agg2.ioAggs,
agg1.sqlDurAggs ++ agg2.sqlDurAggs,
agg1.maxTaskInputSizes ++ agg2.maxTaskInputSizes)
agg1.maxTaskInputSizes ++ agg2.maxTaskInputSizes,
agg1.stageDiagnostics ++ agg2.stageDiagnostics)
}
}
}
@@ -21,9 +21,10 @@ import java.util.concurrent.TimeUnit
import scala.collection.mutable

import com.nvidia.spark.rapids.tool.planparser.DatabricksParseHelper
import com.nvidia.spark.rapids.tool.profiling.{IOAnalysisProfileResult, JobAggTaskMetricsProfileResult, ShuffleSkewProfileResult, SQLDurationExecutorTimeProfileResult, SQLMaxTaskInputSizes, SQLTaskAggMetricsProfileResult, StageAggTaskMetricsProfileResult}
import com.nvidia.spark.rapids.tool.profiling.{IOAnalysisProfileResult, JobAggTaskMetricsProfileResult, ShuffleSkewProfileResult, SQLDurationExecutorTimeProfileResult, SQLMaxTaskInputSizes, SQLTaskAggMetricsProfileResult, StageAggTaskMetricsProfileResult, StageDiagnosticMetricsProfileResult}

import org.apache.spark.sql.rapids.tool.{AppBase, ToolUtils}
import org.apache.spark.sql.rapids.tool.profiling.ApplicationInfo
import org.apache.spark.sql.rapids.tool.store.{AccumInfo, TaskModel}

/**
@@ -320,6 +321,89 @@ class AppSparkMetricsAnalyzer(app: AppBase) extends AppAnalysisBase(app) {
sqlRows.toSeq
}

/**
* Aggregates the diagnostic SparkMetrics by stage.
* @param index the App-index (used by the profiler tool)
* @param analyzerInput optional AppSQLPlanAnalyzer that supplies per-stage node names and
*                      GPU semaphore wait times; defaults to the app's planMetricProcessor
* @return sequence of StageDiagnosticMetricsProfileResult
*/
def aggregateDiagnosticSparkMetricsByStage(index: Int,
analyzerInput: Option[AppSQLPlanAnalyzer] = None):
Seq[StageDiagnosticMetricsProfileResult] = {
def bytesToMB(numBytes: Long): Long = numBytes / (1024 * 1024)
val sqlAnalyzer = analyzerInput match {
case Some(res) => res
case None => app.asInstanceOf[ApplicationInfo].planMetricProcessor
}
// TODO: this has stage attempts. we should handle different attempts
app.stageManager.getAllStages.map { sm =>
// TODO: Should we only consider successful tasks?
val tasksInStage = app.taskManager.getTasks(sm.stageInfo.stageId,
sm.stageInfo.attemptNumber())
// count duplicate task attempts
val numAttempts = tasksInStage.size
val (diskSpilledMin, diskSpilledMed, diskSpilledMax, diskSpilledSum) =
AppSparkMetricsAnalyzer.getStatistics(tasksInStage.map(_.diskBytesSpilled))
val (memSpilledMin, memSpilledMed, memSpilledMax, memSpilledSum) =
AppSparkMetricsAnalyzer.getStatistics(tasksInStage.map(_.memoryBytesSpilled))
val (inputBytesMin, inputBytesMed, inputBytesMax, inputBytesSum) =
AppSparkMetricsAnalyzer.getStatistics(tasksInStage.map(_.input_bytesRead))
val (outputBytesMin, outputBytesMed, outputBytesMax, outputBytesSum) =
AppSparkMetricsAnalyzer.getStatistics(tasksInStage.map(_.output_bytesWritten))
val (srBytesMin, srBytesMed, srBytesMax, srBytesSum) =
AppSparkMetricsAnalyzer.getStatistics(tasksInStage.map(_.sr_totalBytesRead))
val (swBytesMin, swBytesMed, swBytesMax, swBytesSum) =
AppSparkMetricsAnalyzer.getStatistics(tasksInStage.map(_.sw_bytesWritten))
val (srFetchWaitTimeMin, srFetchWaitTimeMed, srFetchWaitTimeMax, srFetchWaitTimeSum) =
AppSparkMetricsAnalyzer.getStatistics(tasksInStage.map(_.sr_fetchWaitTime))
val (swWriteTimeMin, swWriteTimeMed, swWriteTimeMax, swWriteTimeSum) =
AppSparkMetricsAnalyzer.getStatistics(tasksInStage.map(_.sw_writeTime))
val nodeNames = sqlAnalyzer.stageToNodeNames.
getOrElse(sm.stageInfo.stageId, Seq.empty[String])
val gpuSemaphoreWaitSum = sqlAnalyzer.stageToGpuSemaphoreWaitTime.
getOrElse(sm.stageInfo.stageId, 0L)
StageDiagnosticMetricsProfileResult(index,
app.getAppName,
app.appId,
sm.stageInfo.stageId,
sm.duration,
numAttempts, // TODO: why is this numAttempts and not numTasks?
bytesToMB(memSpilledMin),
bytesToMB(memSpilledMed),
bytesToMB(memSpilledMax),
bytesToMB(memSpilledSum),
bytesToMB(diskSpilledMin),
bytesToMB(diskSpilledMed),
bytesToMB(diskSpilledMax),
bytesToMB(diskSpilledSum),
inputBytesMin,
inputBytesMed,
inputBytesMax,
inputBytesSum,
outputBytesMin,
outputBytesMed,
outputBytesMax,
outputBytesSum,
srBytesMin,
srBytesMed,
srBytesMax,
srBytesSum,
swBytesMin,
swBytesMed,
swBytesMax,
swBytesSum,
srFetchWaitTimeMin,
srFetchWaitTimeMed,
srFetchWaitTimeMax,
srFetchWaitTimeSum,
swWriteTimeMin,
swWriteTimeMed,
swWriteTimeMax,
swWriteTimeSum,
gpuSemaphoreWaitSum,
nodeNames)
kuhushukla (Collaborator) commented:
Can we make an encapsulating object for this so that we dont have large arg list as well as a single place to hold the metrics we care about -- easier to update it.

cindyyuanjiang (Collaborator, Author) replied on Oct 30, 2024:
Thanks @kuhushukla! Can you elaborate a bit more on this? I thought StageDiagnosticResult is the encapsulating object. It has similar structure as other profiler results, for example - https://github.com/NVIDIA/spark-rapids-tools/blob/dev/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala#L417.

kuhushukla (Collaborator) replied:
The arg list is very large and that on its own would be nice to abstract away in a case class etc.

cindyyuanjiang (Collaborator, Author) replied on Nov 6, 2024:
thanks @kuhushukla! I experimented with a few things like encapsulating part of the arg list into a separate case class, but overall I think this presentation has the best readability. It also aligns with other classes in this file and current unit tests. We can discuss more offline if there is something else we should try.
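
For context, a minimal sketch of what the suggested encapsulation could look like; this is hypothetical and is not part of the PR, which kept the flat argument list shown above:

// Hypothetical helper grouping the recurring min/med/max/sum pattern.
case class StatDistribution(min: Long, med: Long, max: Long, sum: Long)

object StatDistribution {
  def fromValues(values: Iterable[Long]): StatDistribution = {
    val (mn, md, mx, sm) = AppSparkMetricsAnalyzer.getStatistics(values)
    StatDistribution(mn, md, mx, sm)
  }
}

// StageDiagnosticMetricsProfileResult could then take one StatDistribution per metric
// (memory spilled, disk spilled, input bytes, and so on) instead of four Long fields each.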

}.toSeq
}

/**
* Aggregates the SparkMetrics by stage. This is an internal method to populate the cached metrics
* to be used by other aggregators.
@@ -434,6 +518,23 @@
}
}

/**
* Given an input iterable, returns its min, median, max and sum; returns zeros for an empty input.
*/
def getStatistics(arr: Iterable[Long]): (Long, Long, Long, Long) = {
A collaborator commented:
Do we need this? I thought this information is available and can be simply pulled? Please correct me if I am wrong -- for eg, in the existing profiler o/p where does the median value come from?

cindyyuanjiang (Collaborator, Author) replied:
I updated the implementation to reuse/pull existing metrics results from ProfStageMetricView. I cannot do this for shuffle read total bytes because in ProfStageMetricView there are 2 metrics associated with this: internal.metrics.shuffle.read.localBytesRead and internal.metrics.shuffle.read.remoteBytesRead. I cannot get the min/med/max of shuffle read total bytes by adding the min/med/max of the 2 metrics. I am keeping this function for now, but if it looks too unnecessary I can remove it.

if (arr.isEmpty) {
  (0L, 0L, 0L, 0L)
} else {
  val sortedArr = arr.toSeq.sorted
  val len = sortedArr.size
  val med = if (len % 2 == 0) {
    (sortedArr(len / 2) + sortedArr(len / 2 - 1)) / 2
  } else {
    sortedArr(len / 2)
  }
  (sortedArr.head, med, sortedArr(len - 1), sortedArr.sum)
}
}
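
To make the shuffle-read point above concrete, a small illustrative sketch (the values are made up, not taken from any event log) of why per-task totals must be combined before computing statistics, rather than adding the statistics of the two underlying metrics:

// Hypothetical per-task shuffle read values for one stage (illustrative only).
val localBytesRead = Seq(10L, 0L, 50L)
val remoteBytesRead = Seq(0L, 40L, 5L)

// Combine per task first, then take statistics of the totals.
val totalBytesRead = localBytesRead.zip(remoteBytesRead).map { case (l, r) => l + r }
val (minBytes, medBytes, maxBytes, sumBytes) =
  AppSparkMetricsAnalyzer.getStatistics(totalBytesRead)
// totalBytesRead == Seq(10, 40, 55), so the result is (10, 40, 55, 105).

// Adding the two metrics' statistics instead would be wrong:
// min(localBytesRead) + min(remoteBytesRead) = 0 + 0 = 0, which is not the min of the totals (10).

This is why aggregateDiagnosticSparkMetricsByStage computes the shuffle read statistics from each task's sr_totalBytesRead rather than deriving them from the two ProfStageMetricView metrics.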

def maxWithEmptyHandling(arr: Iterable[Long]): Long = {
if (arr.isEmpty) {
0L
@@ -47,7 +47,8 @@ case class ApplicationSummaryInfo(
ioMetrics: Seq[IOAnalysisProfileResult],
sysProps: Seq[RapidsPropertyProfileResult],
sqlCleanedAlignedIds: Seq[SQLCleanAndAlignIdsProfileResult],
sparkRapidsBuildInfo: Seq[SparkRapidsBuildInfoEvent])
sparkRapidsBuildInfo: Seq[SparkRapidsBuildInfoEvent],
stageDiagnostics: Seq[StageDiagnosticMetricsProfileResult])

trait AppInfoPropertyGetter {
// returns all the properties (i.e., spark)