forked from NVIDIA/spark-rapids-tools
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[FEA] Qualification tool triggers the AutoTuner module (NVIDIA#739)
* [FEA] Qualification tool triggers the AutoTuner module Fixes NVIDIA#700 This is an incremental step toward the full automation of App migration to GPU. - Add Qual arg `--auto-tuner` to toggle the AutoTuner module. Default is Off. - Add Qual arg `--worker-info` to pass the GPU worker info to the Qual's AutoTuner. - When AutoTuner is enabled, the Qual tool will launch the AutoTuner module to make some basic recommendations/comments based on the Spark/Env properties. - A new folder `rapids_4_spark_qualification_output/tuning` is created which contains a text-formatted file for each app. Each file is named after the AppID. - No unit tests are added for now because: 1- the recommendations are based on the Profiler's implementation; and 2- the feature is disabled by default. - There will be a follow-up to incrementally split the logic of the AutoTuner into two classes that aim to tailor the rules/policies of the recommendations to the CPU applications. --------- Signed-off-by: Ahmed Hussein (amahussein) <[email protected]>
- Loading branch information
1 parent
d69dab8
commit a154c0b
Showing
7 changed files
with
275 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
55 changes: 55 additions & 0 deletions
55
core/src/main/scala/com/nvidia/spark/rapids/tool/tuning/QualAppSummaryInfoProvider.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
/* | ||
* Copyright (c) 2024, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package com.nvidia.spark.rapids.tool.tuning | ||
|
||
import com.nvidia.spark.rapids.tool.profiling.AppSummaryInfoBaseProvider | ||
|
||
import org.apache.spark.sql.rapids.tool.qualification.{QualificationAppInfo, QualificationSummaryInfo} | ||
|
||
/**
 * An [[AppSummaryInfoBaseProvider]] backed by the output of the Qualification analysis.
 * Every lookup is answered directly from the wrapped [[QualificationAppInfo]].
 *
 * @param appInfo the main QualificationAppInfo object representing the CPU application.
 * @param appAggStats optional aggregated stats. No method of this class reads it yet; it is
 *                    carried along so that aggregate values can be fed to the AutoTuner later.
 */
class QualAppSummaryInfoProvider(
    val appInfo: QualificationAppInfo,
    val appAggStats: Option[QualificationSummaryInfo]) extends AppSummaryInfoBaseProvider {

  /** This provider always reports that application information is available. */
  override def isAppInfoAvailable: Boolean = true

  /** Looks up `propKey` in the Spark properties of the wrapped application. */
  override def getSparkProperty(propKey: String): Option[String] =
    appInfo.sparkProperties.get(propKey)

  /** RAPIDS properties are resolved through the same lookup as Spark properties. */
  override def getRapidsProperty(propKey: String): Option[String] =
    getSparkProperty(propKey)

  /** Looks up `propKey` in the system properties of the wrapped application. */
  override def getSystemProperty(propKey: String): Option[String] =
    appInfo.systemProperties.get(propKey)

  /** Spark version of the wrapped application; `None` when the underlying value is null. */
  override def getSparkVersion: Option[String] =
    Option(appInfo.sparkVersion)

  /** ID of the wrapped application. */
  def getAppID: String = appInfo.appId
}
75 changes: 75 additions & 0 deletions
75
core/src/main/scala/com/nvidia/spark/rapids/tool/tuning/QualificationAutoTuner.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
/* | ||
* Copyright (c) 2024, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package com.nvidia.spark.rapids.tool.tuning | ||
|
||
import scala.util.{Failure, Success, Try} | ||
|
||
import com.nvidia.spark.rapids.tool.ToolTextFileWriter | ||
import com.nvidia.spark.rapids.tool.profiling.{AppSummaryInfoBaseProvider, AutoTuner, Profiler} | ||
import org.apache.hadoop.conf.Configuration | ||
|
||
import org.apache.spark.internal.Logging | ||
import org.apache.spark.sql.rapids.tool.qualification.{QualificationAppInfo, QualificationSummaryInfo} | ||
|
||
/**
 * Runs the AutoTuner on behalf of the Qualification tool and writes the outcome to a
 * per-application text report.
 *
 * @param appInfoProvider Provider of the qualification analysis data
 * @param tunerContext Container which holds the arguments passed to the AutoTuner execution
 */
class QualificationAutoTuner(val appInfoProvider: QualAppSummaryInfoProvider,
    val tunerContext: TunerContext) {

  /**
   * Writes the recommendations and comments of `tuningResult` into `<appID>.log` under
   * `outputDir`. The writer is closed even if writing fails.
   */
  private def writeTuningReport(tuningResult: TuningResult,
      outputDir: String, hadoopConf: Configuration): Unit = {
    val appId = tuningResult.appID
    val reportWriter = new ToolTextFileWriter(
      outputDir, s"${appId}.log", s"Tuning Qual app - ${appId}", Option(hadoopConf))
    try {
      // The header goes out first; the body is only rendered after the header was written.
      reportWriter.write(s"### Recommended Configuration for App: ${appId} ###\n")
      val renderedBody = Profiler.getAutoTunerResultsAsString(
        tuningResult.recommendations, tuningResult.comments)
      reportWriter.write(renderedBody)
    } finally {
      reportWriter.close()
    }
  }

  /**
   * Builds an AutoTuner from the context, collects its recommended properties and comments,
   * writes them to the tuning report, and returns them.
   *
   * @return the recommendations and comments produced for the application.
   */
  def runAutoTuner(): TuningResult = {
    val tuner: AutoTuner = AutoTuner.buildAutoTuner(
      tunerContext.workerInfoPath, appInfoProvider, tunerContext.platform)
    val (recommendedProps, tunerComments) = tuner.getRecommendedProperties()
    val outcome = TuningResult(appInfoProvider.getAppID, recommendedProps, tunerComments)
    writeTuningReport(outcome, tunerContext.getOutputPath, tunerContext.hadoopConf)
    outcome
  }
}
|
||
object QualificationAutoTuner extends Logging {
  /**
   * Creates a [[QualificationAutoTuner]] for the given application.
   *
   * @param appInfo the qualification info of the application to tune.
   * @param appAggStats optional aggregated stats forwarded to the info provider.
   * @param tunerContext the arguments of the AutoTuner execution.
   * @return the tuner, or `None` (after logging the error) if its construction failed.
   */
  def apply(appInfo: QualificationAppInfo,
      appAggStats: Option[QualificationSummaryInfo],
      tunerContext: TunerContext): Option[QualificationAutoTuner] = {
    val attempt = Try {
      // The cast stays inside the Try so that a provider of an unexpected type is reported
      // as a construction failure instead of escaping to the caller.
      val provider = AppSummaryInfoBaseProvider
        .fromQualAppInfo(appInfo, appAggStats)
        .asInstanceOf[QualAppSummaryInfoProvider]
      new QualificationAutoTuner(provider, tunerContext)
    }
    attempt.failed.foreach { cause =>
      logError(
        s"Failed to create Qualification tuning object for application ${appInfo.appId}", cause)
    }
    attempt.toOption
  }
}
84 changes: 84 additions & 0 deletions
84
core/src/main/scala/com/nvidia/spark/rapids/tool/tuning/TunerContext.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
/* | ||
* Copyright (c) 2024, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package com.nvidia.spark.rapids.tool.tuning | ||
|
||
import scala.util.{Failure, Success, Try} | ||
|
||
import com.nvidia.spark.rapids.tool.Platform | ||
import com.nvidia.spark.rapids.tool.profiling.{RecommendedCommentResult, RecommendedPropertyResult} | ||
import org.apache.hadoop.conf.Configuration | ||
|
||
import org.apache.spark.internal.Logging | ||
import org.apache.spark.sql.rapids.tool.qualification.{QualificationAppInfo, QualificationSummaryInfo} | ||
import org.apache.spark.sql.rapids.tool.util.RapidsToolsConfUtil | ||
|
||
/**
 * Outcome of one AutoTuner run.
 *
 * @param appID ID of the application the result belongs to.
 * @param recommendations the recommended properties produced by the AutoTuner.
 * @param comments the comments produced by the AutoTuner.
 */
case class TuningResult(
    appID: String,
    recommendations: Seq[RecommendedPropertyResult],
    comments: Seq[RecommendedCommentResult])
|
||
/**
 * Container which holds metadata and arguments specific to the execution of the AutoTuner.
 * TODO: we need to use the same class in constructing the AutoTuner in the Profiling tools.
 * @param platform object representing the host platform on which the application was executed.
 * @param workerInfoPath the path of the file describing the GPU workers.
 * @param outputRootDir the root output directory; the recommendations/comments are dumped into
 *                      a sub-folder of it (see `getOutputPath`).
 * @param hadoopConf configuration used to access the (possibly remote) storage.
 */
case class TunerContext (
    platform: Platform,
    workerInfoPath: String,
    outputRootDir: String,
    hadoopConf: Configuration) extends Logging {

  /** Directory, under `outputRootDir`, into which the tuning reports are written. */
  def getOutputPath: String = {
    s"$outputRootDir/rapids_4_spark_qualification_output/tuning"
  }

  /**
   * Runs the AutoTuner for a single application.
   *
   * @param appInfo the qualification info of the application to tune.
   * @param appAggStats optional aggregated stats of the application.
   * @return the tuning result, or `None` when either the tuner could not be created or the
   *         tuning run failed. Failures are logged and never propagated to the caller.
   */
  def tuneApplication(
      appInfo: QualificationAppInfo,
      appAggStats: Option[QualificationSummaryInfo]): Option[TuningResult] = {
    // flatMap (rather than collect/map) so that a failed run collapses to None. Returning a
    // null from inside the Option would hand callers a Some(null) that blows up on first use.
    QualificationAutoTuner(appInfo, appAggStats, this).flatMap { qualTuner =>
      Try {
        qualTuner.runAutoTuner()
      } match {
        case Success(r) => Some(r)
        case Failure(e) =>
          logError(s"Failed to generate tuning recommendations for app: ${appInfo.appId}", e)
          None
      }
    }
  }
}
|
||
object TunerContext extends Logging {
  /**
   * Builds a [[TunerContext]], creating a fresh Hadoop configuration when none is supplied.
   *
   * @param platform object representing the host platform of the application.
   * @param workerInfoPath the path of the GPU workers info.
   * @param outputRootDir the root output directory for the tuning reports.
   * @param hadoopConf optional configuration; when absent one is obtained from
   *                   `RapidsToolsConfUtil.newHadoopConf()`.
   * @return the context, or `None` (after logging the error) if it could not be created.
   */
  def apply(platform: Platform,
      workerInfoPath: String,
      outputRootDir: String,
      hadoopConf: Option[Configuration] = None): Option[TunerContext] = {
    val attempt = Try {
      val effectiveConf = hadoopConf match {
        case Some(suppliedConf) => suppliedConf
        case None => RapidsToolsConfUtil.newHadoopConf()
      }
      TunerContext(platform, workerInfoPath, outputRootDir, effectiveConf)
    }
    attempt.failed.foreach { cause =>
      logError("Could not create Tuner Context", cause)
    }
    attempt.toOption
  }
}