Add OpenLineage reporting support for Spark connector
Parent: 307607c
Commit: 164f9ac
Showing 6 changed files with 264 additions and 4 deletions.
...n/resources/META-INF/services/io.openlineage.spark.extension.OpenLineageExtensionProvider (1 change: 1 addition & 0 deletions)
@@ -0,0 +1 @@
org.apache.hadoop.hbase.spark.SparkHBaseLineageProvider
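This service file presumably lets the OpenLineage Spark integration discover the connector's provider through the standard java.util.ServiceLoader mechanism. A minimal sketch of that lookup, for illustration only:

import java.util.ServiceLoader
import scala.collection.JavaConverters._
import io.openlineage.spark.extension.OpenLineageExtensionProvider

// Lists every provider registered under META-INF/services; with this commit on the
// classpath the output should include org.apache.hadoop.hbase.spark.SparkHBaseLineageProvider.
ServiceLoader.load(classOf[OpenLineageExtensionProvider]).asScala
  .foreach(p => println(p.getClass.getName))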
.../hbase-spark/src/main/scala/org/apache/hadoop/hbase/spark/SparkHBaseLineageProvider.scala (12 changes: 12 additions & 0 deletions)
@@ -0,0 +1,12 @@
package org.apache.hadoop.hbase.spark

import io.openlineage.spark.extension.OpenLineageExtensionProvider
import io.openlineage.spark.shade.extension.v1.lifecycle.plan.SparkOpenLineageExtensionVisitor

class SparkHBaseLineageProvider extends OpenLineageExtensionProvider {

  def shadedPackage(): String =
    "org.apache.hbase.thirdparty.io.openlineage.spark.shade"

  override def getVisitorClassName: String = classOf[SparkOpenLineageExtensionVisitor].getCanonicalName
}
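The provider is small: getVisitorClassName reports which visitor class the OpenLineage agent should use, and shadedPackage appears to point at the relocated copy of the OpenLineage extension classes bundled under HBase's org.apache.hbase.thirdparty prefix (the exact contract of shadedPackage is not shown in this diff). A quick way to see both values, as an illustrative sketch:

// Illustrative only: instantiate the provider and print the values it exposes.
val provider = new org.apache.hadoop.hbase.spark.SparkHBaseLineageProvider
println(provider.shadedPackage())     // org.apache.hbase.thirdparty.io.openlineage.spark.shade
println(provider.getVisitorClassName) // canonical name of SparkOpenLineageExtensionVisitor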
spark/hbase-spark/src/test/scala/org/apache/hadoop/hbase/spark/OpenLineageSuite.scala (179 changes: 179 additions & 0 deletions)
@@ -0,0 +1,179 @@
package org.apache.hadoop.hbase.spark

import io.openlineage.spark.agent.OpenLineageSparkListener
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put}
import org.apache.hadoop.hbase.spark.datasources.{HBaseSparkConf, HBaseTableCatalog}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseTestingUtility, TableName}
import org.apache.spark.SparkConf
import org.apache.spark.sql.{SQLContext, SparkSession}
import org.json4s._
import org.json4s.jackson.JsonMethods._
import org.scalatest.Matchers.convertToAnyShouldWrapper
import org.scalatest.concurrent.Eventually
import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite}

import java.io.File
import scala.collection.mutable.ArrayBuffer
import scala.io.Source

class OpenLineageSuite extends FunSuite with Eventually with BeforeAndAfterEach with BeforeAndAfterAll with Logging {
  @transient var sc: SparkSession = null
  var TEST_UTIL: HBaseTestingUtility = new HBaseTestingUtility

  val t1TableName = "t1"
  val t2TableName = "t2"
  val columnFamily = "c"
  var sqlContext: SQLContext = null

  val timestamp = 1234567890000L
  val lineageFile = File.createTempFile(s"openlineage_test_${System.nanoTime()}", ".log")

  override def beforeAll() {

    TEST_UTIL.startMiniCluster

    logInfo(" - minicluster started")
    try
      TEST_UTIL.deleteTable(TableName.valueOf(t1TableName))
    catch {
      case e: Exception => logInfo(" - no table " + t1TableName + " found")
    }
    try
      TEST_UTIL.deleteTable(TableName.valueOf(t2TableName))
    catch {
      case e: Exception => logInfo(" - no table " + t2TableName + " found")
    }

    logInfo(" - creating table " + t1TableName)
    TEST_UTIL.createTable(TableName.valueOf(t1TableName), Bytes.toBytes(columnFamily))
    logInfo(" - created table")
    logInfo(" - creating table " + t2TableName)
    TEST_UTIL.createTable(TableName.valueOf(t2TableName), Bytes.toBytes(columnFamily))
    logInfo(" - created table")

    val sparkConf = new SparkConf
    sparkConf.set(HBaseSparkConf.QUERY_CACHEBLOCKS, "true")
    sparkConf.set(HBaseSparkConf.QUERY_BATCHSIZE, "100")
    sparkConf.set(HBaseSparkConf.QUERY_CACHEDROWS, "100")
    sparkConf.set("spark.extraListeners", classOf[OpenLineageSparkListener].getCanonicalName)
    sparkConf.set("spark.openlineage.transport.type", "file")
    sparkConf.set("spark.openlineage.transport.location", lineageFile.getAbsolutePath)

    sc = SparkSession.builder().master("local").appName("openlineage-test").config(sparkConf).getOrCreate()
    val connection = ConnectionFactory.createConnection(TEST_UTIL.getConfiguration)
    try {
      val t1Table = connection.getTable(TableName.valueOf(t1TableName))

      try {
        var put = new Put(Bytes.toBytes("get1"))
        put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo1"))
        put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("b"), Bytes.toBytes("1"))
        put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("i"), Bytes.toBytes(1))
        t1Table.put(put)
        put = new Put(Bytes.toBytes("get2"))
        put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo2"))
        put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("b"), Bytes.toBytes("4"))
        put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("i"), Bytes.toBytes(4))
        put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("z"), Bytes.toBytes("FOO"))
        t1Table.put(put)
        put = new Put(Bytes.toBytes("get3"))
        put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo3"))
        put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("b"), Bytes.toBytes("8"))
        put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("i"), Bytes.toBytes(8))
        t1Table.put(put)
        put = new Put(Bytes.toBytes("get4"))
        put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo4"))
        put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("b"), Bytes.toBytes("10"))
        put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("i"), Bytes.toBytes(10))
        put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("z"), Bytes.toBytes("BAR"))
        t1Table.put(put)
        put = new Put(Bytes.toBytes("get5"))
        put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("a"), Bytes.toBytes("foo5"))
        put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("b"), Bytes.toBytes("8"))
        put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("i"), Bytes.toBytes(8))
        t1Table.put(put)
      } finally {
        t1Table.close()
      }
    } finally {
      connection.close()
    }

    new HBaseContext(sc.sparkContext, TEST_UTIL.getConfiguration)
  }

  override def afterAll() {
    TEST_UTIL.deleteTable(TableName.valueOf(t1TableName))
logInfo("shuting down minicluster") | ||
    TEST_UTIL.shutdownMiniCluster()

    sc.stop()
  }

  override def beforeEach(): Unit = {
    DefaultSourceStaticUtils.lastFiveExecutionRules.clear()
  }

  test("Test rowKey point only rowKey query") {
    val hbaseTable1Catalog =
      s"""{
         |"table":{"namespace":"default", "name":"t1"},
         |"rowkey":"key",
         |"columns":{
         |"KEY_FIELD":{"cf":"rowkey", "col":"key", "type":"string"},
         |"A_FIELD":{"cf":"c", "col":"a", "type":"string"},
         |"B_FIELD":{"cf":"c", "col":"b", "type":"string"}
         |}
         |}""".stripMargin

    val hbaseTable2Catalog =
      s"""{
         |"table":{"namespace":"default", "name":"t2"},
         |"rowkey":"key",
         |"columns":{
         |"KEY_FIELD":{"cf":"rowkey", "col":"key", "type":"string"},
         |"OUTPUT_A_FIELD":{"cf":"c", "col":"a", "type":"string"},
         |"OUTPUT_B_FIELD":{"cf":"c", "col":"b", "type":"string"}
         |}
         |}""".stripMargin

    val results = sc.read
      .options(Map(HBaseTableCatalog.tableCatalog -> hbaseTable1Catalog))
      .format("org.apache.hadoop.hbase.spark")
      .load()

    results.createOrReplaceTempView("tempview")

    val outputDf = sc.sql("SELECT KEY_FIELD, A_FIELD AS OUTPUT_A_FIELD, B_FIELD AS OUTPUT_B_FIELD FROM tempview")

    outputDf.write
      .format("org.apache.hadoop.hbase.spark")
      .options(Map(HBaseTableCatalog.tableCatalog -> hbaseTable2Catalog))
      .save()

    val events = eventually { val eventLog = parseEventLog(lineageFile); eventLog.size shouldBe 1; eventLog }

    val json = events.head
    assert(((json \\ "inputs")(0) \ "name") == JString("default.t1"))
    assert(((json \\ "inputs")(0) \ "namespace") == JString("hbase://127.0.0.1"))
    assert(((json \\ "outputs")(0) \ "name") == JString("default.t2"))
    assert(((json \\ "outputs")(0) \ "namespace") == JString("hbase://127.0.0.1"))
  }

  def parseEventLog(file: File): List[JValue] = {
    val source = Source.fromFile(file)
    val eventlist = ArrayBuffer.empty[JValue]
    for (line <- source.getLines()) {
      val event = parse(line)
      for {
        JObject(child) <- event
        JField("inputs", JArray(inputs)) <- child
        JField("outputs", JArray(outputs)) <- child
        JField("eventType", JString(eventType)) <- child
        if outputs.nonEmpty && inputs.nonEmpty && eventType == "COMPLETE"
      } yield eventlist += event
    }
    eventlist.toList
  }
}