From 20f515250b1772ee691d44c6fb0e423a1b66b706 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Tue, 16 Apr 2024 13:45:30 -0500 Subject: [PATCH] Add support for self-contained profiling Signed-off-by: Jason Lowe --- .../com/nvidia/spark/rapids/Plugin.scala | 5 + .../spark/rapids/RangeConfMatcher.scala | 87 ++++ .../com/nvidia/spark/rapids/RapidsConf.scala | 83 ++++ .../com/nvidia/spark/rapids/profiler.scala | 387 ++++++++++++++++++ .../sql/rapids/execution/TrampolineUtil.scala | 14 +- .../spark/rapids/RangeConfMatcherSuite.scala | 82 ++++ 6 files changed, 654 insertions(+), 4 deletions(-) create mode 100644 sql-plugin/src/main/scala/com/nvidia/spark/rapids/RangeConfMatcher.scala create mode 100644 sql-plugin/src/main/scala/com/nvidia/spark/rapids/profiler.scala create mode 100644 sql-plugin/src/test/scala/com/nvidia/spark/rapids/RangeConfMatcherSuite.scala diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala index bd5d76e2337..74204e2d698 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala @@ -446,6 +446,7 @@ class RapidsDriverPlugin extends DriverPlugin with Logging { } rapidsShuffleHeartbeatManager.executorHeartbeat(id) case m: GpuCoreDumpMsg => GpuCoreDumpHandler.handleMsg(m) + case m: ProfileMsg => ProfilerOnDriver.handleMsg(m) case m => throw new IllegalStateException(s"Unknown message $m") } } @@ -458,6 +459,7 @@ class RapidsDriverPlugin extends DriverPlugin with Logging { RapidsPluginUtils.detectMultipleJars(conf) RapidsPluginUtils.logPluginMode(conf) GpuCoreDumpHandler.driverInit(sc, conf) + ProfilerOnDriver.init(sc, conf) if (GpuShuffleEnv.isRapidsShuffleAvailable(conf)) { GpuShuffleEnv.initShuffleManager() @@ -507,6 +509,7 @@ class RapidsExecutorPlugin extends ExecutorPlugin with Logging { val sparkConf = pluginContext.conf() val numCores = RapidsPluginUtils.estimateCoresOnExec(sparkConf) val conf = new RapidsConf(extraConf.asScala.toMap) + ProfilerOnExecutor.init(pluginContext, conf) // Checks if the current GPU architecture is supported by the // spark-rapids-jni and cuDF libraries. @@ -656,6 +659,7 @@ class RapidsExecutorPlugin extends ExecutorPlugin with Logging { GpuSemaphore.shutdown() PythonWorkerSemaphore.shutdown() GpuDeviceManager.shutdown() + ProfilerOnExecutor.shutdown() Option(rapidsShuffleHeartbeatEndpoint).foreach(_.close()) extraExecutorPlugins.foreach(_.shutdown()) FileCache.shutdown() @@ -692,6 +696,7 @@ class RapidsExecutorPlugin extends ExecutorPlugin with Logging { override def onTaskStart(): Unit = { startTaskNvtx(TaskContext.get) extraExecutorPlugins.foreach(_.onTaskStart()) + ProfilerOnExecutor.onTaskStart() } override def onTaskSucceeded(): Unit = { diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RangeConfMatcher.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RangeConfMatcher.scala new file mode 100644 index 00000000000..951e8f71990 --- /dev/null +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RangeConfMatcher.scala @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids + +import scala.util.Try + +/** + * Determines if a value is in a comma-separated list of values and/or + * hyphenated ranges provided by the user for a configuration setting. + */ +class RangeConfMatcher(configKey: String, configValue: Option[String]) { + def this(conf: RapidsConf, entry: ConfEntry[String]) = { + this(entry.key, Some(conf.get(entry))) + } + + def this(conf: RapidsConf, entry: OptionalConfEntry[String]) = { + this(entry.key, conf.get(entry)) + } + + private val (stringSet, intRanges) = { + configValue.map { cv => + val parts = cv.split(',') + val (rangeParts, singleParts) = parts.partition(_.contains('-')) + val ranges = try { + rangeParts.map(RangeConfMatcher.parseRange) + } catch { + case e: IllegalArgumentException => + throw new IllegalArgumentException(s"Invalid range settings for $configKey: $cv", e) + } + (singleParts.map(_.trim).toSet, ranges) + }.getOrElse((Set.empty[String], Array.empty[(Int, Int)])) + } + + val isEmpty: Boolean = stringSet.isEmpty && intRanges.isEmpty + val nonEmpty: Boolean = !isEmpty + + def size: Int = { + stringSet.size + intRanges.map { + case (start, end) => end - start + 1 + }.sum + } + + /** Returns true if the string value is in the configured values or ranges. */ + def contains(v: String): Boolean = { + stringSet.contains(v) || (intRanges.nonEmpty && Try(v.toInt).map(checkRanges).getOrElse(false)) + } + + /** Returns true if the integer value is in the configured values or ranges. */ + def contains(v: Int): Boolean = { + checkRanges(v) || stringSet.contains(v.toString) + } + + private def checkRanges(v: Int): Boolean = { + intRanges.exists { + case (start, end) => start <= v && v <= end + } + } +} + +object RangeConfMatcher { + def parseRange(rangeStr: String): (Int,Int) = { + val rangePair = rangeStr.split('-') + if (rangePair.length != 2) { + throw new IllegalArgumentException(s"Invalid range: $rangeStr") + } + val start = rangePair.head.trim.toInt + val end = rangePair.last.trim.toInt + if (end < start) { + throw new IllegalArgumentException(s"Invalid range: $rangeStr") + } + (start, end) + } +} \ No newline at end of file diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala index f7ff38cb193..1f9380ddbd0 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala @@ -708,6 +708,71 @@ val GPU_COREDUMP_PIPE_PATTERN = conf("spark.rapids.gpu.coreDump.pipePattern") .checkValues(Set("DEBUG", "MODERATE", "ESSENTIAL")) .createWithDefault("MODERATE") + val PROFILE_PATH = conf("spark.rapids.profile.pathPrefix") + .doc("Enables profiling and specifies a URI path to use when writing profile data") + .internal() + .stringConf + .createOptional + + val PROFILE_EXECUTORS = conf("spark.rapids.profile.executors") + .doc("Comma-separated list of executors IDs and hyphenated ranges of executor IDs to " + + "profile when profiling is enabled") + .internal() + .stringConf + .createWithDefault("0") + + val PROFILE_TIME_RANGES_SECONDS = conf("spark.rapids.profile.timeRangesInSeconds") + .doc("Comma-separated list of start-end ranges of time, in seconds, since executor startup " + + "to start and stop profiling. For example, a value of 10-30,100-110 will have the profiler " + + "wait for 10 seconds after executor startup then profile for 20 seconds, then wait for " + + "70 seconds then profile again for the next 10 seconds") + .internal() + .stringConf + .createOptional + + val PROFILE_JOBS = conf("spark.rapids.profile.jobs") + .doc("Comma-separated list of job IDs and hyphenated ranges of job IDs to " + + "profile when profiling is enabled") + .internal() + .stringConf + .createOptional + + val PROFILE_STAGES = conf("spark.rapids.profile.stages") + .doc("Comma-separated list of stage IDs and hyphenated ranges of stage IDs to " + + "profile when profiling is enabled") + .internal() + .stringConf + .createOptional + + val PROFILE_DRIVER_POLL_MILLIS = conf("spark.rapids.profile.driverPollMillis") + .doc("Interval in milliseconds the executors will poll for job and stage completion when " + + "stage-level profiling is used.") + .internal() + .integerConf + .createWithDefault(1000) + + val PROFILE_COMPRESSION = conf("spark.rapids.profile.compression") + .doc("Specifies the compression codec to use when writing profile data, one of " + + "zstd or none") + .internal() + .stringConf + .transform(_.toLowerCase(java.util.Locale.ROOT)) + .checkValues(Set("zstd", "none")) + .createWithDefault("zstd") + + val PROFILE_FLUSH_PERIOD_MILLIS = conf("spark.rapids.profile.flushPeriodMillis") + .doc("Specifies the time period in milliseconds to flush profile records. " + + "A value <= 0 will disable time period flushing.") + .internal() + .integerConf + .createWithDefault(0) + + val PROFILE_WRITE_BUFFER_SIZE = conf("spark.rapids.profile.writeBufferSize") + .doc("Buffer size to use when writing profile records.") + .internal() + .bytesConf(ByteUnit.BYTE) + .createWithDefault(1024 * 1024) + // ENABLE/DISABLE PROCESSING val SQL_ENABLED = conf("spark.rapids.sql.enabled") @@ -2397,6 +2462,24 @@ class RapidsConf(conf: Map[String, String]) extends Logging { lazy val metricsLevel: String = get(METRICS_LEVEL) + lazy val profilePath: Option[String] = get(PROFILE_PATH) + + lazy val profileExecutors: String = get(PROFILE_EXECUTORS) + + lazy val profileTimeRangesSeconds: Option[String] = get(PROFILE_TIME_RANGES_SECONDS) + + lazy val profileJobs: Option[String] = get(PROFILE_JOBS) + + lazy val profileStages: Option[String] = get(PROFILE_STAGES) + + lazy val profileDriverPollMillis: Int = get(PROFILE_DRIVER_POLL_MILLIS) + + lazy val profileCompression: String = get(PROFILE_COMPRESSION) + + lazy val profileFlushPeriodMillis: Int = get(PROFILE_FLUSH_PERIOD_MILLIS) + + lazy val profileWriteBufferSize: Long = get(PROFILE_WRITE_BUFFER_SIZE) + lazy val isSqlEnabled: Boolean = get(SQL_ENABLED) lazy val isSqlExecuteOnGPU: Boolean = get(SQL_MODE).equals("executeongpu") diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/profiler.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/profiler.scala new file mode 100644 index 00000000000..2714c655b5b --- /dev/null +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/profiler.scala @@ -0,0 +1,387 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids + +import java.nio.ByteBuffer +import java.nio.channels.{Channels, WritableByteChannel} +import java.util.concurrent.{ConcurrentHashMap, Future, ScheduledExecutorService, TimeUnit} +import java.util.regex.Pattern + +import scala.collection.mutable + +import com.nvidia.spark.rapids.jni.Profiler +import org.apache.hadoop.fs.Path +import org.apache.hadoop.ipc.CallerContext + +import org.apache.spark.{SparkContext, TaskContext} +import org.apache.spark.api.plugin.PluginContext +import org.apache.spark.internal.Logging +import org.apache.spark.io.CompressionCodec +import org.apache.spark.scheduler.{SparkListener, SparkListenerJobEnd, SparkListenerStageCompleted} +import org.apache.spark.sql.rapids.execution.TrampolineUtil +import org.apache.spark.util.SerializableConfiguration + +object ProfilerOnExecutor extends Logging { + private val jobPattern = Pattern.compile("SPARK_.*_JId_([0-9]+).*") + private var writer: Option[ProfileWriter] = None + private var timeRanges: Option[Seq[(Long, Long)]] = None + private var jobRanges: RangeConfMatcher = null + private var stageRanges: RangeConfMatcher = null + // NOTE: Active sets are updated asynchronously, synchronize on ProfilerOnExecutor to access + private val activeJobs = mutable.HashSet[Int]() + private val activeStages = mutable.HashSet[Int]() + private var timer: Option[ScheduledExecutorService] = None + private var timerFuture: Option[Future[_]] = None + private var driverPollMillis = 0 + private val startTimestamp = System.nanoTime() + private var isProfileActive = false + + def init(pluginCtx: PluginContext, conf: RapidsConf): Unit = { + require(writer.isEmpty, "Already initialized") + timeRanges = conf.profileTimeRangesSeconds.map(parseTimeRanges) + jobRanges = new RangeConfMatcher(conf, RapidsConf.PROFILE_JOBS) + stageRanges = new RangeConfMatcher(conf, RapidsConf.PROFILE_STAGES) + driverPollMillis = conf.profileDriverPollMillis + if (timeRanges.isDefined && (stageRanges.nonEmpty || jobRanges.nonEmpty)) { + throw new UnsupportedOperationException( + "Profiling with time ranges and stage or job ranges simultaneously is not supported") + } + writer = conf.profilePath.flatMap { pathPrefix => + val executorId = pluginCtx.executorID() + if (shouldProfile(executorId, conf)) { + logInfo("Initializing profiler") + if (jobRanges.nonEmpty) { + // Need caller context enabled to get the job ID of a task on the executor + TrampolineUtil.getSparkHadoopUtilConf.setBoolean("hadoop.caller.context.enabled", true) + } + val codec = conf.profileCompression match { + case "none" => None + case c => Some(TrampolineUtil.createCodec(pluginCtx.conf(), c)) + } + val w = new ProfileWriter(pluginCtx, pathPrefix, codec) + Profiler.init(w, conf.profileWriteBufferSize, conf.profileFlushPeriodMillis) + Some(w) + } else { + None + } + } + writer.foreach { _ => + updateAndSchedule() + } + } + + def onTaskStart(): Unit = { + if (jobRanges.nonEmpty) { + val callerCtx = CallerContext.getCurrent + if (callerCtx != null) { + val m = jobPattern.matcher(callerCtx.getContext) + if (m.matches()) { + val jobId = m.group(1).toInt + if (jobRanges.contains(jobId)) { + synchronized { + activeJobs.add(jobId) + enable() + startPollingDriver() + } + } + } + } + } + if (stageRanges.nonEmpty) { + val taskCtx = TaskContext.get + val stageId = taskCtx.stageId + if (stageRanges.contains(stageId)) { + synchronized { + activeStages.add(taskCtx.stageId) + enable() + startPollingDriver() + } + } + } + } + + def shutdown(): Unit = { + writer.foreach { w => + timerFuture.foreach(_.cancel(false)) + timerFuture = None + Profiler.shutdown() + w.close() + } + writer = None + } + + private def enable(): Unit = { + writer.foreach { w => + if (!isProfileActive) { + Profiler.start() + isProfileActive = true + w.pluginCtx.send(ProfileStatusMsg(w.executorId, "profile started")) + } + } + } + + private def disable(): Unit = { + writer.foreach { w => + if (isProfileActive) { + Profiler.stop() + isProfileActive = false + w.pluginCtx.send(ProfileStatusMsg(w.executorId, "profile stopped")) + } + } + } + + private def shouldProfile(executorId: String, conf: RapidsConf): Boolean = { + val matcher = new RangeConfMatcher(conf, RapidsConf.PROFILE_EXECUTORS) + matcher.contains(executorId) + } + + private def parseTimeRanges(confVal: String): Seq[(Long, Long)] = { + val ranges = try { + confVal.split(',').map(RangeConfMatcher.parseRange).map { + case (start, end) => + // convert relative time in seconds to absolute time in nanoseconds + (startTimestamp + TimeUnit.SECONDS.toNanos(start), + startTimestamp + TimeUnit.SECONDS.toNanos(end)) + } + } catch { + case e: IllegalArgumentException => + throw new IllegalArgumentException( + s"Invalid range settings for ${RapidsConf.PROFILE_TIME_RANGES_SECONDS}: $confVal", e) + } + ranges.sorted.toIndexedSeq + } + + private def updateAndSchedule(): Unit = { + if (timeRanges.isDefined) { + if (timer.isEmpty) { + timer = Some(TrampolineUtil.newDaemonSingleThreadScheduledExecutor("profiler timer")) + } + val now = System.nanoTime() + // skip time ranges that have already passed + val currentRanges = timeRanges.get.dropWhile { + case (_, end) => end <= now + } + timeRanges = Some(currentRanges) + if (currentRanges.isEmpty) { + logWarning("No further time ranges to profile, shutting down") + shutdown() + } else { + currentRanges.headOption.foreach { + case (start, end) => + val delay = if (start <= now) { + enable() + end - now + } else { + disable() + start - now + } + timerFuture = Some(timer.get.schedule(new Runnable { + override def run(): Unit = try { + updateAndSchedule() + } catch { + case e: Exception => + logError(s"Error in profiler timer task", e) + } + }, delay, TimeUnit.NANOSECONDS)) + } + } + } else if (jobRanges.nonEmpty || stageRanges.nonEmpty) { + // nothing to do yet, profiling will start when tasks for targeted job/stage are seen + } else { + enable() + } + } + + private def startPollingDriver(): Unit = { + if (timerFuture.isEmpty) { + if (timer.isEmpty) { + timer = Some(TrampolineUtil.newDaemonSingleThreadScheduledExecutor("profiler timer")) + } + timerFuture = Some(timer.get.scheduleWithFixedDelay(() => try { + updateActiveFromDriver() + } catch { + case e: Exception => + logError("Profiler timer task error: ", e) + }, driverPollMillis, driverPollMillis, TimeUnit.MILLISECONDS)) + } + } + + private def stopPollingDriver(): Unit = { + timerFuture.foreach(_.cancel(false)) + timerFuture = None + } + + private def updateActiveFromDriver(): Unit = { + writer.foreach { w => + val (jobs, stages) = synchronized { + (activeJobs.toArray, activeStages.toArray) + } + val (completedJobs, completedStages, allDone) = + w.pluginCtx.ask(ProfileJobStageQueryMsg(jobs, stages)) + .asInstanceOf[(Array[Int], Array[Int], Boolean)] + if (completedJobs.nonEmpty || completedStages.nonEmpty) { + synchronized { + completedJobs.foreach(activeJobs.remove) + completedStages.foreach(activeStages.remove) + if (activeJobs.isEmpty && activeStages.isEmpty) { + disable() + stopPollingDriver() + } + } + } + if (allDone) { + logWarning("No further jobs or stages to profile, shutting down") + shutdown() + } + } + } +} + +class ProfileWriter( + val pluginCtx: PluginContext, + profilePathPrefix: String, + codec: Option[CompressionCodec]) extends Profiler.DataWriter { + val executorId: String = pluginCtx.executorID() + private val outPath = getOutputPath(profilePathPrefix, codec) + private val out = openOutput(codec) + private var isClosed = false + + override def write(data: ByteBuffer): Unit = { + if (!isClosed) { + while (data.hasRemaining) { + out.write(data) + } + } + } + + override def close(): Unit = { + if (!isClosed) { + isClosed = true + out.close() + pluginCtx.send(ProfileEndMsg(executorId, outPath.toString)) + } + } + + private def getAppId: String = { + val appId = pluginCtx.conf.get("spark.app.id", "") + if (appId.isEmpty) { + java.lang.management.ManagementFactory.getRuntimeMXBean.getName + } else { + appId + } + } + + private def getOutputPath(prefix: String, codec: Option[CompressionCodec]): Path = { + val parentDir = new Path(prefix) + val suffix = codec.map(c => "." + TrampolineUtil.getCodecShortName(c.getClass.getName)) + .getOrElse("") + new Path(parentDir, s"rapids-profile-$getAppId-$executorId.bin$suffix") + } + + private def openOutput(codec: Option[CompressionCodec]): WritableByteChannel = { + val hadoopConf = pluginCtx.ask(ProfileInitMsg(executorId, outPath.toString)) + .asInstanceOf[SerializableConfiguration].value + val fs = outPath.getFileSystem(hadoopConf) + val fsStream = fs.create(outPath, false) + val outStream = codec.map(_.compressedOutputStream(fsStream)).getOrElse(fsStream) + Channels.newChannel(outStream) + } +} + +object ProfilerOnDriver extends Logging { + private var hadoopConf: SerializableConfiguration = null + private var jobRanges: RangeConfMatcher = null + private var numJobsToProfile: Long = 0L + private var stageRanges: RangeConfMatcher = null + private var numStagesToProfile: Long = 0L + private val completedJobs = new ConcurrentHashMap[Int, Unit]() + private val completedStages = new ConcurrentHashMap[Int, Unit]() + private var isJobsStageProfilingComplete = false + + def init(sc: SparkContext, conf: RapidsConf): Unit = { + // if no profile path, profiling is disabled and nothing to do + conf.profilePath.foreach { _ => + hadoopConf = new SerializableConfiguration(sc.hadoopConfiguration) + jobRanges = new RangeConfMatcher(conf, RapidsConf.PROFILE_JOBS) + stageRanges = new RangeConfMatcher(conf, RapidsConf.PROFILE_STAGES) + if (jobRanges.nonEmpty || stageRanges.nonEmpty) { + numJobsToProfile = jobRanges.size + numStagesToProfile = stageRanges.size + if (jobRanges.nonEmpty) { + // Need caller context enabled to get the job ID of a task on the executor + sc.getConf.set("hadoop.caller.context.enabled", "true") + } + sc.addSparkListener(Listener) + } + } + } + + def handleMsg(m: ProfileMsg): AnyRef = m match { + case ProfileInitMsg(executorId, path) => + logWarning(s"Profiling: Executor $executorId initialized profiler, writing to $path") + if (hadoopConf == null) { + throw new IllegalStateException("Hadoop configuration not set") + } + hadoopConf + case ProfileStatusMsg(executorId, msg) => + logWarning(s"Profiling: Executor $executorId: $msg") + null + case ProfileJobStageQueryMsg(activeJobs, activeStages) => + val filteredJobs = activeJobs.filter(j => completedJobs.containsKey(j)) + val filteredStages = activeStages.filter(s => completedStages.containsKey(s)) + (filteredJobs, filteredStages, isJobsStageProfilingComplete) + case ProfileEndMsg(executorId, path) => + logWarning(s"Profiling: Executor $executorId ended profiling, profile written to $path") + null + case _ => + throw new IllegalStateException(s"Unexpected profile msg: $m") + } + + private object Listener extends SparkListener { + override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = { + val jobId = jobEnd.jobId + if (jobRanges.contains(jobId)) { + completedJobs.putIfAbsent(jobId, Unit) + isJobsStageProfilingComplete = completedJobs.size == numJobsToProfile && + completedStages.size == numStagesToProfile + } + } + + override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = { + val stageId = stageCompleted.stageInfo.stageId + if (stageRanges.contains(stageId)) { + completedStages.putIfAbsent(stageId, Unit) + isJobsStageProfilingComplete = completedJobs.size == numJobsToProfile && + completedStages.size == numStagesToProfile + } + } + } +} + +trait ProfileMsg + +case class ProfileInitMsg(executorId: String, path: String) extends ProfileMsg +case class ProfileStatusMsg(executorId: String, msg: String) extends ProfileMsg +case class ProfileEndMsg(executorId: String, path: String) extends ProfileMsg + +// Reply is a tuple of: +// - array of jobs that have completed +// - array of stages that have completed +// - boolean if there are no further jobs/stages to profile +case class ProfileJobStageQueryMsg( + activeJobs: Array[Int], + activeStages: Array[Int]) extends ProfileMsg diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/TrampolineUtil.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/TrampolineUtil.scala index eb16edc0b51..5ffe08348f1 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/TrampolineUtil.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/TrampolineUtil.scala @@ -16,8 +16,9 @@ package org.apache.spark.sql.rapids.execution -import java.util.concurrent.ThreadPoolExecutor +import java.util.concurrent.{ScheduledExecutorService, ThreadPoolExecutor} +import org.apache.hadoop.conf.Configuration import org.json4s.JsonAST import org.apache.spark.{SparkConf, SparkContext, SparkEnv, SparkMasterRegex, SparkUpgradeException, TaskContext} @@ -40,7 +41,7 @@ import org.apache.spark.sql.rapids.shims.DataTypeUtilsShim import org.apache.spark.sql.rapids.shims.SparkUpgradeExceptionShims import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.storage.BlockManagerId -import org.apache.spark.util.{ShutdownHookManager, Utils} +import org.apache.spark.util.{ShutdownHookManager, ThreadUtils, Utils} object TrampolineUtil { def doExecuteBroadcast[T](child: SparkPlan): Broadcast[T] = child.doExecuteBroadcast() @@ -228,11 +229,16 @@ object TrampolineUtil { // We want to utilize the ThreadUtils class' ThreadPoolExecutor creation // which gives us important Hadoop config variables that are needed for the // Unity Catalog authentication - org.apache.spark.util.ThreadUtils.newDaemonCachedThreadPool(prefix, maxThreadNumber, - keepAliveSeconds) + ThreadUtils.newDaemonCachedThreadPool(prefix, maxThreadNumber, keepAliveSeconds) + } + + def newDaemonSingleThreadScheduledExecutor(threadName: String): ScheduledExecutorService = { + ThreadUtils.newDaemonSingleThreadScheduledExecutor(threadName) } def postEvent(sc: SparkContext, sparkEvent: SparkListenerEvent): Unit = { sc.listenerBus.post(sparkEvent) } + + def getSparkHadoopUtilConf: Configuration = SparkHadoopUtil.get.conf } diff --git a/sql-plugin/src/test/scala/com/nvidia/spark/rapids/RangeConfMatcherSuite.scala b/sql-plugin/src/test/scala/com/nvidia/spark/rapids/RangeConfMatcherSuite.scala new file mode 100644 index 00000000000..518bd3012f2 --- /dev/null +++ b/sql-plugin/src/test/scala/com/nvidia/spark/rapids/RangeConfMatcherSuite.scala @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids + +import org.scalatest.funsuite.AnyFunSuite + +class RangeConfMatcherSuite extends AnyFunSuite { + + test("empty") { + val conf = new RapidsConf(Map(RapidsConf.PROFILE_EXECUTORS.key -> "")) + val matcher = new RangeConfMatcher(conf, RapidsConf.PROFILE_EXECUTORS) + assert(!matcher.contains("x")) + assert(!matcher.contains(0)) + } + + test("bad ranges") { + Seq("-", "-4", "4-", "4-3", "d-4", "4-d", "23a-24b", "3-5,8,x-y").foreach { v => + val conf = new RapidsConf(Map(RapidsConf.PROFILE_EXECUTORS.key -> v)) + assertThrows[IllegalArgumentException] { + new RangeConfMatcher(conf, RapidsConf.PROFILE_EXECUTORS) + } + } + } + + test("singles") { + Seq("driver", "0,driver", "0, driver", "driver, 0", "1, driver, x").foreach { v => + val conf = new RapidsConf(Map(RapidsConf.PROFILE_EXECUTORS.key -> v)) + val matcher = new RangeConfMatcher(conf, RapidsConf.PROFILE_EXECUTORS) + assert(matcher.contains("driver")) + assert(!matcher.contains("driverx")) + assert(!matcher.contains("xdriver")) + assert(!matcher.contains("drive")) + assert(!matcher.contains("drive")) + } + } + + test("range only") { + Seq("7-7", "3-7", "2-30", "2-3,5-7,8-10", "2-3, 5-7, 8-10", + " 2 - 3, 5 - 7, 8 - 10").foreach { v => + val conf = new RapidsConf(Map(RapidsConf.PROFILE_EXECUTORS.key -> v)) + val matcher = new RangeConfMatcher(conf, RapidsConf.PROFILE_EXECUTORS) + assert(matcher.contains("7")) + assert(matcher.contains(7)) + assert(!matcher.contains("0")) + assert(!matcher.contains(0)) + assert(!matcher.contains("70")) + assert(!matcher.contains(70)) + } + } + + test("singles range mix") { + Seq("driver,7-10", "driver, 7 - 10", "driver, 7-10", "3-5,7,1-3,driver").foreach { v => + val conf = new RapidsConf(Map(RapidsConf.PROFILE_EXECUTORS.key -> v)) + val matcher = new RangeConfMatcher(conf, RapidsConf.PROFILE_EXECUTORS) + assert(matcher.contains("driver")) + assert(!matcher.contains("driverx")) + assert(!matcher.contains("xdriver")) + assert(!matcher.contains("drive")) + assert(!matcher.contains("drive")) + assert(matcher.contains("7")) + assert(matcher.contains(7)) + assert(!matcher.contains("0")) + assert(!matcher.contains(0)) + assert(!matcher.contains("70")) + assert(!matcher.contains(70)) + } + } +}