Add support for HiveTableScan and InsertIntoHive text format
Fixes #681

- Add a unit test covering scan-hive-text.
- Check the SerDe class to determine whether the Hive format is supported (a standalone sketch of this lookup follows the diff); only Hive text is supported for now.
- Update the read format to include the ScanHive operations.
- Add support for NativeScan as an alternative to the "Node Scan".

Signed-off-by: Ahmed Hussein (amahussein) <[email protected]>
1 parent: 2883413
Commit: 30ac07a
Showing 16 changed files with 859 additions and 77 deletions.
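
The name-based node detection this commit adds can be sketched in isolation. Below is a minimal, self-contained sketch (the object name HiveNodeNameSketch is made up for illustration); it mirrors the isHiveTableScanNode/isHiveTableInsertNode checks in the diff below rather than reproducing them.

object HiveNodeNameSketch {
  // Hive scan nodes are labeled "Scan hive <table>" in the Spark plan graph.
  def isHiveScan(nodeName: String): Boolean =
    nodeName.toLowerCase.startsWith("scan hive")

  // Hive writes appear as nodes whose name contains "InsertIntoHiveTable".
  def isHiveInsert(nodeName: String): Boolean =
    nodeName.contains("InsertIntoHiveTable")

  def main(args: Array[String]): Unit = {
    println(isHiveScan("Scan hive default.logs"))        // true
    println(isHiveInsert("Execute InsertIntoHiveTable")) // true
    println(isHiveScan("Scan parquet default.logs"))     // false
  }
}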
core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/HiveParseHelper.scala
101 changes: 101 additions & 0 deletions

@@ -0,0 +1,101 @@
/*
 * Copyright (c) 2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.nvidia.spark.rapids.tool.planparser

import org.apache.spark.internal.Logging
import org.apache.spark.sql.execution.ui.SparkPlanGraphNode
import org.apache.spark.sql.rapids.tool.util.EventUtils

// A wrapper class to map a Hive SerDe class to the read format it implies.
case class HiveScanSerdeClasses(className: String, format: String) extends Logging {
  def parseReadNode(node: SparkPlanGraphNode): ReadMetaData = {
    logDebug(s"Parsing node as ScanHiveTable: ${node.desc}")
    // Schema and pushedFilters are empty for now, as we cannot extract them from eventlogs yet.
    ReadMetaData("", "HiveTableRelation", "unknown", format)
  }
}

// Utilities used to handle Hive ops.
object HiveParseHelper extends Logging {
  val SCAN_HIVE_LABEL = "scan hive"
  val SCAN_HIVE_EXEC_NAME = "HiveTableScanExec"
  val INSERT_INTO_HIVE_LABEL = "InsertIntoHiveTable"

  // The following is a list of classes we can look for in the SerDe.
  // We should maintain this table with custom classes as needed.
  // Note that we map each SerDe to a "Hive*" format because, according to the documentation,
  // the Hive formats still differ from the native ones. For example, this is why
  // "supportedDataSource.csv" has a "HiveText" entry.
  private val LOADED_SERDE_CLASSES = Seq(
    HiveScanSerdeClasses("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "HiveText"),
    HiveScanSerdeClasses("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
      "HiveParquet"),
    HiveScanSerdeClasses("org.apache.hadoop.hive.serde2.avro.AvroSerDe", "HiveAvro"),
    HiveScanSerdeClasses("org.apache.hadoop.hive.serde2.OpenCSVSerde", "HiveCSV"),
    HiveScanSerdeClasses("org.apache.hadoop.hive.ql.io.orc.OrcSerde", "HiveORC")
  )

  def isHiveTableInsertNode(nodeName: String): Boolean = {
    nodeName.contains(INSERT_INTO_HIVE_LABEL)
  }

  def isHiveTableScanNode(nodeName: String): Boolean = {
    nodeName.toLowerCase.startsWith(SCAN_HIVE_LABEL)
  }

  def isHiveTableScanNode(node: SparkPlanGraphNode): Boolean = {
    isHiveTableScanNode(node.name)
  }

  // Given a "scan hive" graph node, construct the ReadMetaData based on the SerDe class.
  // If the SerDe class does not match the lookups, it returns an "unknown" format.
  def parseReadNode(node: SparkPlanGraphNode): ReadMetaData = {
    LOADED_SERDE_CLASSES.find(k => node.desc.contains(k.className)).map(
      _.parseReadNode(node)).getOrElse(ReadMetaData("", "HiveTableRelation", "unknown", "unknown"))
  }

  // Given a Hive write node (e.g. InsertIntoHiveTable), construct the write format based on the
  // SerDe class. If the SerDe class does not match the lookups, it returns an "unknown" format.
  def getWriteFormat(node: SparkPlanGraphNode): String = {
    val readMetaData = parseReadNode(node)
    readMetaData.format
  }

  def isHiveEnabled(properties: collection.Map[String, String]): Boolean = {
    EventUtils.isPropertyMatch(properties, "spark.sql.catalogImplementation", "", "hive")
  }

  // Keep for future improvement, as we can pass this information to the AutoTuner/user to
  // suggest recommendations regarding ORC optimizations.
  def isORCNativeEnabled(properties: collection.Map[String, String]): Boolean = {
    EventUtils.isPropertyMatch(properties, "spark.sql.orc.impl", "native", "native") ||
      EventUtils.isPropertyMatch(properties, "spark.sql.hive.convertMetastoreOrc", "true", "true")
  }

  // Keep for future improvement, as we can pass this information to the AutoTuner/user to
  // suggest recommendations regarding Parquet optimizations.
  def isConvertParquetEnabled(properties: collection.Map[String, String]): Boolean = {
    EventUtils.isPropertyMatch(properties, "spark.sql.hive.convertMetastoreParquet", "true", "true")
  }

  // Keep for future improvement, as we can pass this information to the AutoTuner/user to
  // suggest recommendations regarding text optimizations for the GPU.
  def isRAPIDSTextHiveEnabled(properties: collection.Map[String, String]): Boolean = {
    EventUtils.isPropertyMatch(properties,
      "spark.rapids.sql.format.hive.text.enabled", "true", "true")
  }
}
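
As promised above, here is a self-contained sketch of the SerDe lookup (PlanNode is a hypothetical stand-in for SparkPlanGraphNode, and only part of the SerDe table is reproduced); it mirrors parseReadNode under those assumptions rather than reproducing the committed implementation.

object HiveSerdeLookupSketch {
  // Hypothetical stand-in for SparkPlanGraphNode; only the description matters here.
  final case class PlanNode(name: String, desc: String)

  // Subset of the SerDe-to-format table maintained in HiveParseHelper above.
  private val serdeToFormat = Seq(
    "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe" -> "HiveText",
    "org.apache.hadoop.hive.ql.io.orc.OrcSerde" -> "HiveORC"
  )

  // The first SerDe class mentioned in the node description decides the format;
  // anything unrecognized falls back to "unknown".
  def readFormat(node: PlanNode): String =
    serdeToFormat.collectFirst {
      case (cls, fmt) if node.desc.contains(cls) => fmt
    }.getOrElse("unknown")

  def main(args: Array[String]): Unit = {
    val textScan = PlanNode("Scan hive default.t",
      "HiveTableScan ... Serde Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")
    println(readFormat(textScan))                            // HiveText
    println(readFormat(PlanNode("Scan hive x", "no serde"))) // unknown
  }
}

Note how an unrecognized SerDe degrades to an "unknown" format rather than failing, matching the fallback ReadMetaData in parseReadNode above.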