-
Notifications
You must be signed in to change notification settings - Fork 40
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[FEA] Add stage/task level diagnostic output for GPU slowness in Profiler tool (#1375)
* initial implementation Signed-off-by: cindyyuanjiang <[email protected]> * updated output schema based on offline discussion Signed-off-by: cindyyuanjiang <[email protected]> * address feedback to merge two tables together Signed-off-by: cindyyuanjiang <[email protected]> * update order of columns Signed-off-by: cindyyuanjiang <[email protected]> * get gpu semaphore time Signed-off-by: cindyyuanjiang <[email protected]> * add benchmark Signed-off-by: cindyyuanjiang <[email protected]> * clean up code Signed-off-by: cindyyuanjiang <[email protected]> * add unit test Signed-off-by: cindyyuanjiang <[email protected]> * new expectation file Signed-off-by: cindyyuanjiang <[email protected]> * address review feedback Signed-off-by: cindyyuanjiang <[email protected]> * address review feedback Signed-off-by: cindyyuanjiang <[email protected]> * add new file Signed-off-by: cindyyuanjiang <[email protected]> * remove unnecessary comment Signed-off-by: cindyyuanjiang <[email protected]> * address review feedback Signed-off-by: cindyyuanjiang <[email protected]> * refactored for memory optimization Signed-off-by: cindyyuanjiang <[email protected]> * addressed review feedback Signed-off-by: cindyyuanjiang <[email protected]> * refactor stageDiagnosticResults Signed-off-by: cindyyuanjiang <[email protected]> * change num attempts to tasks Signed-off-by: cindyyuanjiang <[email protected]> * remove diagnostic from applicationsummaryinfo Signed-off-by: cindyyuanjiang <[email protected]> * remove unused import Signed-off-by: cindyyuanjiang <[email protected]> * new file Signed-off-by: cindyyuanjiang <[email protected]> * add diagnostic view in qual tool output Signed-off-by: cindyyuanjiang <[email protected]> * remove diagnostic view from qual tool profile.log file Signed-off-by: cindyyuanjiang <[email protected]> * address review feedback Signed-off-by: cindyyuanjiang <[email protected]> * add profile benchmark class Signed-off-by: cindyyuanjiang <[email protected]> * fix profiler benchmark Signed-off-by: cindyyuanjiang <[email protected]> --------- Signed-off-by: cindyyuanjiang <[email protected]>
- Loading branch information
1 parent
9cba927
commit de40e8d
Showing
19 changed files
with
560 additions
and
87 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
36 changes: 36 additions & 0 deletions
36
core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/AnalysisUtils.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
/* | ||
* Copyright (c) 2024, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package com.nvidia.spark.rapids.tool.analysis | ||
|
||
/**
 * Accumulator metric names used as stage diagnostic metrics.
 */
object StageAccumDiagnosticMetrics {
  // Spill metrics
  val MEMORY_SPILLED_METRIC = "internal.metrics.memoryBytesSpilled"
  val DISK_SPILLED_METRIC = "internal.metrics.diskBytesSpilled"

  // Input / output metrics
  val INPUT_BYTES_READ_METRIC = "internal.metrics.input.bytesRead"
  val OUTPUT_BYTES_WRITTEN_METRIC = "internal.metrics.output.bytesWritten"

  // Shuffle metrics
  val SW_TOTAL_BYTES_METRIC = "internal.metrics.shuffle.write.bytesWritten"
  val SR_FETCH_WAIT_TIME_METRIC = "internal.metrics.shuffle.read.fetchWaitTime"
  val SW_WRITE_TIME_METRIC = "internal.metrics.shuffle.write.writeTime"

  // GPU semaphore metric
  val GPU_SEMAPHORE_WAIT_METRIC = "gpuSemaphoreWait"

  /**
   * Collects every metric name declared in this object.
   *
   * @return a set holding all eight diagnostic metric names
   */
  def getAllDiagnosticMetrics: Set[String] = Set(
    MEMORY_SPILLED_METRIC,
    DISK_SPILLED_METRIC,
    INPUT_BYTES_READ_METRIC,
    OUTPUT_BYTES_WRITTEN_METRIC,
    SW_TOTAL_BYTES_METRIC,
    SR_FETCH_WAIT_TIME_METRIC,
    SW_WRITE_TIME_METRIC,
    GPU_SEMAPHORE_WAIT_METRIC
  )
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
25 changes: 25 additions & 0 deletions
25
core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/DiagnosticSummaryInfo.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
/* | ||
* Copyright (c) 2024, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package com.nvidia.spark.rapids.tool.profiling | ||
|
||
/**
 * Container for the Profiler's diagnostic information.
 * TODO: Two more fields/views are planned for upcoming PRs.
 *
 * @param stageDiagnostics the collected [[StageDiagnosticResult]] entries
 */
case class DiagnosticSummaryInfo(stageDiagnostics: Seq[StageDiagnosticResult])
Oops, something went wrong.