Skip to content

Commit

Permalink
addressed review comments and updated test results
Browse files Browse the repository at this point in the history
Signed-off-by: Niranjan Artal <[email protected]>
  • Loading branch information
nartal1 committed Aug 28, 2023
1 parent 7b6803f commit 2607489
Show file tree
Hide file tree
Showing 8 changed files with 37 additions and 59 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ class QualificationAppInfo(
val stageIdToTaskEndSum: HashMap[Long, StageTaskQualificationSummary] =
HashMap.empty[Long, StageTaskQualificationSummary]
val stageIdToGpuCpuTransitions: HashMap[Int, Int] = HashMap.empty[Int, Int]
var execsNoStageTransitions: Int = 0

val stageIdToSqlID: HashMap[Int, Long] = HashMap.empty[Int, Long]
val sqlIDtoFailures: HashMap[Long, ArrayBuffer[String]] = HashMap.empty[Long, ArrayBuffer[String]]
Expand Down Expand Up @@ -152,8 +151,10 @@ class QualificationAppInfo(

// Look at the total task times for all jobs/stages that aren't SQL or
// SQL but dataset or rdd
private def calculateNonSQLTaskDataframeDuration(taskDFDuration: Long): Long = {
val allTaskTime = stageIdToTaskEndSum.values.map(_.totalTaskDuration).sum
private def calculateNonSQLTaskDataframeDuration(
taskDFDuration: Long,
totalTransitionTime: Long): Long = {
val allTaskTime = stageIdToTaskEndSum.values.map(_.totalTaskDuration).sum + totalTransitionTime
val res = allTaskTime - taskDFDuration
assert(res >= 0)
res
Expand All @@ -166,11 +167,11 @@ class QualificationAppInfo(
}

private def calculateSQLSupportedTaskDuration(all: Seq[StageQualSummaryInfo]): Long = {
all.map(s => s.stageTaskTime - s.unsupportedTaskDur).sum - calculateNoExecsStageDurations(all)
all.map(s => s.stageTaskTime - s.unsupportedTaskDur).sum
}

private def calculateSQLUnsupportedTaskDuration(all: Seq[StageQualSummaryInfo]): Long = {
all.map(_.unsupportedTaskDur).sum + calculateNoExecsStageDurations(all)
all.map(_.unsupportedTaskDur).sum
}

private def calculateSpeedupFactor(all: Seq[StageQualSummaryInfo]): Double = {
Expand All @@ -179,23 +180,6 @@ class QualificationAppInfo(
res
}

private def calculateNoExecsStageDurations(all: Seq[StageQualSummaryInfo]): Long = {
// If there are Execs not associated with any stage, then some of the Execs may not be
// supported on GPU. We need to estimate the duration of these Execs and add it to the
// unsupportedTaskDur. We estimate the duration by taking the average of the unsupportedTaskDur
// of all the stages and multiplying it by the number of Execs that are not associated with
// any stage. We multiply with a penalty factor of 0.05
// TODO: Need to come up with better heuristics for penalty factor.
val unsupportedTasksize= all.map(_.unsupportedTaskDur).size
if (execsNoStageTransitions != 0 && unsupportedTasksize != 0) {
execsNoStageTransitions * (
all.map(_.unsupportedTaskDur).sum / unsupportedTasksize) * 0.05
}.toLong
else {
0L
}
}

private def getAllReadFileFormats: Seq[String] = {
dataSourceInfo.map { ds =>
s"${ds.format.toLowerCase()}[${ds.schema}]"
Expand Down Expand Up @@ -267,7 +251,6 @@ class QualificationAppInfo(
case false => stageIdToGpuCpuTransitions.getOrElse(stageId, 0)
case true => 0
}
// val numTransitions = stageIdToGpuCpuTransitions.getOrElse(stageId, 0)
val transitionsTime = numTransitions match {
case 0 => 0L // no transitions
case gpuCpuTransitions if gpuCpuTransitions > 0 =>
Expand All @@ -286,13 +269,8 @@ class QualificationAppInfo(
}
case _ => 0L
}
// Update totaltaskduration of stageIdToTaskEndSum to include transitions time
val stageIdToTasksMetrics = stageIdToTaskEndSum.get(stageId).orElse(None)
if (stageIdToTasksMetrics.isDefined) {
stageIdToTasksMetrics.get.totalTaskDuration += transitionsTime
}
StageQualSummaryInfo(stageId, allSpeedupFactorAvg, stageTaskTime,
eachStageUnsupported + transitionsTime, estimated, numTransitions)
eachStageUnsupported + transitionsTime, numTransitions, transitionsTime, estimated)
}.toSet
}

Expand Down Expand Up @@ -333,9 +311,6 @@ class QualificationAppInfo(
}
stageIdToGpuCpuTransitions(stageId) = transitions
}
if (execsNoStage.nonEmpty) {
execsNoStageTransitions += execsNoStage.filterNot(exec => exec.isSupported).size
}
if (allStagesToExecs.isEmpty) {
// use job level
// also get the job ids associated with the SQLId
Expand Down Expand Up @@ -516,11 +491,12 @@ class QualificationAppInfo(
val allStagesSummary = perSqlStageSummary.flatMap(_.stageSum)
.map(sum => sum.stageId -> sum).toMap.values.toSeq
val sqlDataframeTaskDuration = allStagesSummary.map(s => s.stageTaskTime).sum
val totalTransitionsTime = allStagesSummary.map(s=> s.transitionTime).sum
val unsupportedSQLTaskDuration = calculateSQLUnsupportedTaskDuration(allStagesSummary)
val endDurationEstimated = this.appEndTime.isEmpty && appDuration > 0
val jobOverheadTime = calculateJobOverHeadTime(info.startTime)
val nonSQLDataframeTaskDuration =
calculateNonSQLTaskDataframeDuration(sqlDataframeTaskDuration)
calculateNonSQLTaskDataframeDuration(sqlDataframeTaskDuration, totalTransitionsTime)
val nonSQLTaskDuration = nonSQLDataframeTaskDuration + jobOverheadTime
// note that these ratios are based off the stage times which may be missing some stage
// overhead or execs that didn't have associated stages
Expand Down Expand Up @@ -577,7 +553,8 @@ class QualificationAppInfo(

// get the ratio based on the Task durations that we will use for wall clock durations
val estimatedGPURatio = if (sqlDataframeTaskDuration > 0) {
supportedSQLTaskDuration.toDouble / sqlDataframeTaskDuration.toDouble
supportedSQLTaskDuration.toDouble / (
sqlDataframeTaskDuration.toDouble + totalTransitionsTime.toDouble)
} else {
1
}
Expand Down Expand Up @@ -825,8 +802,9 @@ case class StageQualSummaryInfo(
averageSpeedup: Double,
stageTaskTime: Long,
unsupportedTaskDur: Long,
estimated: Boolean = false,
numTransitions: Int)
numTransitions: Int,
transitionTime: Long,
estimated: Boolean = false)

object QualificationAppInfo extends Logging {
// define recommendation constants
Expand Down
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
App Name,App ID,Recommendation,Estimated GPU Speedup,Estimated GPU Duration,Estimated GPU Time Saved,SQL DF Duration,SQL Dataframe Task Duration,App Duration,GPU Opportunity,Executor CPU Time Percent,SQL Ids with Failures,Unsupported Read File Formats and Types,Unsupported Write Data Format,Complex Types,Nested Complex Types,Potential Problems,Longest SQL Duration,NONSQL Task Duration Plus Overhead,Unsupported Task Duration,Supported SQL DF Task Duration,Task Speedup Factor,App Duration Estimated,Unsupported Execs,Unsupported Expressions,Estimated Job Frequency (monthly)
"Spark shell","app-20211019113801-0001","Not Recommended",1.0,569387.57,2579.42,3627,19894,571967,3500,28.41,"","JDBC[*]","","","","",1812,544575,693,19201,3.8,false,"Scan JDBCRelation(TBLS) [numPartitions=1];Execute CreateViewCommand;CollectLimit","",30
"Spark shell","app-20211019113801-0001","Not Recommended",1.0,569385.42,2581.57,3627,19894,571967,3503,28.41,"","JDBC[*]","","","","",1812,544575,677,19217,3.8,false,"Scan JDBCRelation(TBLS) [numPartitions=1];Execute CreateViewCommand;CollectLimit","",30
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
App Name,App ID,Recommendation,Estimated GPU Speedup,Estimated GPU Duration,Estimated GPU Time Saved,SQL DF Duration,SQL Dataframe Task Duration,App Duration,GPU Opportunity,Executor CPU Time Percent,SQL Ids with Failures,Unsupported Read File Formats and Types,Unsupported Write Data Format,Complex Types,Nested Complex Types,Potential Problems,Longest SQL Duration,NONSQL Task Duration Plus Overhead,Unsupported Task Duration,Supported SQL DF Task Duration,Task Speedup Factor,App Duration Estimated,Unsupported Execs,Unsupported Expressions,Estimated Job Frequency (monthly)
"Rapids Spark Profiling Tool Unit Tests","local-1622043423018","Recommended",1.92,8477.87,7841.12,12434,132257,16319,10582,37.7,"","","JSON","","","",7143,4717,19691,112566,3.86,false,"SerializeFromObject;Execute InsertIntoHadoopFsRelationCommand json;DeserializeToObject;Filter;MapElements;Scan","",1
"Spark shell","local-1651187225439","Not Recommended",1.0,355490.83,146.16,760,180,355637,333,87.88,"","JSON[string:bigint:int]","","","","",498,343411,101,79,1.78,false,"SerializeFromObject;CollectLimit;DeserializeToObject;Scan json;Filter;MapElements","",1
"Spark shell","local-1651188809790","Not Recommended",1.0,166213.92,1.07,911,283,166215,3,81.18,"","JSON[string:bigint:int]","","","","UDF",715,133608,282,1,1.5,false,"CollectLimit;Scan json;Project","UDF",1
"Rapids Spark Profiling Tool Unit Tests","local-1623281204390","Not Recommended",1.0,6240.0,0.0,2032,4666,6240,-151,46.27,"","JSON[string:bigint:int]","JSON","","","UDF",1209,5793,5013,-347,1.0,false,"Execute InsertIntoHadoopFsRelationCommand json;LocalTableScan;Project;Scan json;Execute CreateViewCommand","UDF",1
"Rapids Spark Profiling Tool Unit Tests","local-1622043423018","Recommended",1.92,8472.65,7846.34,12434,132257,16319,10589,37.7,"","","JSON","","","",7143,4717,19616,112641,3.86,false,"SerializeFromObject;Execute InsertIntoHadoopFsRelationCommand json;DeserializeToObject;Filter;MapElements;Scan","",1
"Spark shell","local-1651187225439","Not Recommended",1.0,355483.43,153.56,760,180,355637,350,87.88,"","JSON[string:bigint:int]","","","","",498,343411,97,83,1.78,false,"SerializeFromObject;CollectLimit;DeserializeToObject;Scan json;Filter;MapElements","",1
"Spark shell","local-1651188809790","Not Recommended",1.0,166199.97,15.02,911,283,166215,45,81.18,"","JSON[string:bigint:int]","","","","UDF",715,133608,269,14,1.5,false,"CollectLimit;Scan json;Project","UDF",1
"Rapids Spark Profiling Tool Unit Tests","local-1623281204390","Not Recommended",1.0,6240.0,0.0,2032,4666,6240,0,46.27,"","JSON[string:bigint:int]","JSON","","","UDF",1209,5793,4664,2,1.0,false,"Execute InsertIntoHadoopFsRelationCommand json;LocalTableScan;Project;Scan json;Execute CreateViewCommand","UDF",1
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
App Name,App ID,SQL ID,SQL Description,SQL DF Duration,GPU Opportunity,Estimated GPU Duration,Estimated GPU Speedup,Estimated GPU Time Saved,Recommendation
"Rapids Spark Profiling Tool Unit Tests","local-1622043423018",1,"count at QualificationInfoUtils.scala:94",7143,6714,2082.44,3.43,5060.55,"Strongly Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1622043423018",3,"count at QualificationInfoUtils.scala:94",2052,1655,804.22,2.55,1247.77,"Strongly Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1622043423018",2,"count at QualificationInfoUtils.scala:94",1933,1546,767.5,2.51,1165.49,"Strongly Recommended"
"Spark shell","local-1651188809790",1,"show at <console>:26",196,90,150.76,1.3,45.23,"Recommended"
"Spark shell","local-1651187225439",0,"show at <console>:26",498,226,384.81,1.29,113.18,"Not Recommended"
"Spark shell","local-1651187225439",1,"show at <console>:26",262,40,247.69,1.05,14.3,"Not Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1622043423018",0,"json at QualificationInfoUtils.scala:76",1306,131,1264.57,1.03,41.42,"Not Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1623281204390",0,"json at QualificationInfoUtils.scala:130",1209,-543,1209.0,1.0,0.0,"Not Recommended"
"Spark shell","local-1651188809790",0,"show at <console>:26",715,-66,715.0,1.0,0.0,"Not Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1623281204390",2,"json at QualificationInfoUtils.scala:136",321,-144,321.0,1.0,0.0,"Not Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1623281204390",5,"json at QualificationInfoUtils.scala:136",129,-57,129.0,1.0,0.0,"Not Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1623281204390",8,"json at QualificationInfoUtils.scala:136",127,-56,127.0,1.0,0.0,"Not Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1623281204390",3,"json at QualificationInfoUtils.scala:130",108,-48,108.0,1.0,0.0,"Not Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1622043423018",1,"count at QualificationInfoUtils.scala:94",7143,6719,2078.49,3.43,5064.5,"Strongly Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1622043423018",3,"count at QualificationInfoUtils.scala:94",2052,1660,800.56,2.56,1251.43,"Strongly Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1622043423018",2,"count at QualificationInfoUtils.scala:94",1933,1551,763.96,2.53,1169.03,"Strongly Recommended"
"Spark shell","local-1651187225439",0,"show at <console>:26",498,249,373.5,1.33,124.5,"Recommended"
"Spark shell","local-1651188809790",1,"show at <console>:26",196,98,147.0,1.33,49.0,"Recommended"
"Spark shell","local-1651187225439",1,"show at <console>:26",262,60,240.54,1.08,21.45,"Not Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1622043423018",0,"json at QualificationInfoUtils.scala:76",1306,187,1246.97,1.04,59.02,"Not Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1623281204390",0,"json at QualificationInfoUtils.scala:130",1209,0,1209.0,1.0,0.0,"Not Recommended"
"Spark shell","local-1651188809790",0,"show at <console>:26",715,2,715.0,1.0,0.0,"Not Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1623281204390",2,"json at QualificationInfoUtils.scala:136",321,0,321.0,1.0,0.0,"Not Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1623281204390",5,"json at QualificationInfoUtils.scala:136",129,0,129.0,1.0,0.0,"Not Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1623281204390",8,"json at QualificationInfoUtils.scala:136",127,0,127.0,1.0,0.0,"Not Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1623281204390",6,"json at QualificationInfoUtils.scala:130",110,0,110.0,1.0,0.0,"Not Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1623281204390",3,"json at QualificationInfoUtils.scala:130",108,0,108.0,1.0,0.0,"Not Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1623281204390",4,"createOrReplaceTempView at QualificationInfoUtils.scala:133",22,22,22.0,1.0,0.0,"Not Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1623281204390",7,"createOrReplaceTempView at QualificationInfoUtils.scala:133",4,4,4.0,1.0,0.0,"Not Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1623281204390",1,"createOrReplaceTempView at QualificationInfoUtils.scala:133",2,2,2.0,1.0,0.0,"Not Recommended"
"Rapids Spark Profiling Tool Unit Tests","local-1623281204390",6,"json at QualificationInfoUtils.scala:130",110,-49,110.0,0.99,-0.01,"Not Recommended"
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
App Name,App ID,Recommendation,Estimated GPU Speedup,Estimated GPU Duration,Estimated GPU Time Saved,SQL DF Duration,SQL Dataframe Task Duration,App Duration,GPU Opportunity,Executor CPU Time Percent,SQL Ids with Failures,Unsupported Read File Formats and Types,Unsupported Write Data Format,Complex Types,Nested Complex Types,Potential Problems,Longest SQL Duration,NONSQL Task Duration Plus Overhead,Unsupported Task Duration,Supported SQL DF Task Duration,Task Speedup Factor,App Duration Estimated,Unsupported Execs,Unsupported Expressions,Estimated Job Frequency (monthly)
"Spark shell","local-1624371544219","Not Recommended",1.0,174808.87,484.12,6695,20421,175293,832,72.15,"","JSON[string:double:date:int:bigint];Text[*]","JSON","","","",1859,175857,17882,2539,2.39,false,"CollectLimit;Scan json;Execute InsertIntoHadoopFsRelationCommand json;Scan text","",30
"Spark shell","local-1624371544219","Not Recommended",1.0,174691.42,601.57,6695,20421,175293,1034,72.15,"","JSON[string:double:date:int:bigint];Text[*]","JSON","","","",1859,175857,17266,3155,2.39,false,"CollectLimit;Scan json;Execute InsertIntoHadoopFsRelationCommand json;Scan text","",30
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
App Name,App ID,Recommendation,Estimated GPU Speedup,Estimated GPU Duration,Estimated GPU Time Saved,SQL DF Duration,SQL Dataframe Task Duration,App Duration,GPU Opportunity,Executor CPU Time Percent,SQL Ids with Failures,Unsupported Read File Formats and Types,Unsupported Write Data Format,Complex Types,Nested Complex Types,Potential Problems,Longest SQL Duration,NONSQL Task Duration Plus Overhead,Unsupported Task Duration,Supported SQL DF Task Duration,Task Speedup Factor,App Duration Estimated,Unsupported Execs,Unsupported Expressions,Estimated Job Frequency (monthly)
"Spark shell","local-1624371906627","Not Recommended",1.0,83316.93,421.06,6760,21802,83738,723,71.3,"","Text[*];json[double]","JSON","","","",1984,82505,19467,2335,2.39,false,"BatchScan json;Execute InsertIntoHadoopFsRelationCommand json;CollectLimit;Scan text","",30
"Spark shell","local-1624371906627","Not Recommended",1.0,83172.84,565.15,6760,21802,83738,971,71.3,"","Text[*];json[double]","JSON","","","",1984,82505,18668,3134,2.39,false,"BatchScan json;Execute InsertIntoHadoopFsRelationCommand json;CollectLimit;Scan text","",30
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
App Name,App ID,Recommendation,Estimated GPU Speedup,Estimated GPU Duration,Estimated GPU Time Saved,SQL DF Duration,SQL Dataframe Task Duration,App Duration,GPU Opportunity,Executor CPU Time Percent,SQL Ids with Failures,Unsupported Read File Formats and Types,Unsupported Write Data Format,Complex Types,Nested Complex Types,Potential Problems,Longest SQL Duration,NONSQL Task Duration Plus Overhead,Unsupported Task Duration,Supported SQL DF Task Duration,Task Speedup Factor,App Duration Estimated,Unsupported Execs,Unsupported Expressions,Estimated Job Frequency (monthly)
"Spark shell","local-1634253215009","Not Recommended",1.01,46361.16,701.83,1520,359,47063,999,67.64,"","Text[*]","","","","",1068,44935,123,236,3.36,false,"CollectLimit;Scan text","",30
"Spark shell","local-1634253215009","Not Recommended",1.01,46352.24,710.75,1520,359,47063,1011,67.64,"","Text[*]","","","","",1068,44935,120,239,3.36,false,"CollectLimit;Scan text","",30
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
App Name,App ID,Recommendation,Estimated GPU Speedup,Estimated GPU Duration,Estimated GPU Time Saved,SQL DF Duration,SQL Dataframe Task Duration,App Duration,GPU Opportunity,Executor CPU Time Percent,SQL Ids with Failures,Unsupported Read File Formats and Types,Unsupported Write Data Format,Complex Types,Nested Complex Types,Potential Problems,Longest SQL Duration,NONSQL Task Duration Plus Overhead,Unsupported Task Duration,Supported SQL DF Task Duration,Task Speedup Factor,App Duration Estimated,Unsupported Execs,Unsupported Expressions,Estimated Job Frequency (monthly)
"Rapids Spark Profiling Tool Unit Tests","local-1622043423018","Not Recommended",1.08,4484.54,387.45,1306,14353,4872,548,62.67,"","","JSON","","","",1306,4477,8328,6025,3.41,true,"SerializeFromObject;Execute InsertIntoHadoopFsRelationCommand json;DeserializeToObject;Filter;MapElements;Scan","",30
"Rapids Spark Profiling Tool Unit Tests","local-1622043423018","Not Recommended",1.09,4468.98,403.01,1306,14353,4872,570,62.67,"","","JSON","","","",1306,4477,8086,6267,3.41,true,"SerializeFromObject;Execute InsertIntoHadoopFsRelationCommand json;DeserializeToObject;Filter;MapElements;Scan","",30

0 comments on commit 2607489

Please sign in to comment.