
Commit

Merge branch 'main' into ppl-projection-command
# Conflicts:
#	docs/ppl-lang/PPL-Example-Commands.md
#	ppl-spark-integration/src/main/antlr4/OpenSearchPPLLexer.g4
YANG-DB committed Jan 11, 2025
2 parents 2337358 + 974d7d4 commit bf90692
Showing 30 changed files with 1,972 additions and 104 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test-and-build-workflow.yml
@@ -37,7 +37,7 @@ jobs:

- name: Upload test report
if: always() # Ensures the artifact is saved even if tests fail
-uses: actions/upload-artifact@v3
+uses: actions/upload-artifact@v4
with:
name: test-reports
path: target/test-reports # Adjust this path if necessary
3 changes: 1 addition & 2 deletions build.sbt
@@ -68,8 +68,7 @@ val packagesToShade = Seq(
"org.glassfish.json.**",
"org.joda.time.**",
"org.reactivestreams.**",
"org.yaml.**",
"software.amazon.**"
"org.yaml.**"
)

ThisBuild / assemblyShadeRules := Seq(
2 changes: 2 additions & 0 deletions docs/index.md
@@ -394,6 +394,7 @@ User can provide the following options in `WITH` clause of create statement:
+ `watermark_delay`: a string as time expression for how late data can come and still be processed, e.g. 1 minute, 10 seconds. This is required by auto and incremental refresh on materialized view if it has aggregation in the query.
+ `output_mode`: a mode string that describes how data will be written to streaming sink. If unspecified, default append mode will be applied.
+ `index_settings`: a JSON string as index settings for OpenSearch index that will be created. Please follow the format in OpenSearch documentation. If unspecified, default OpenSearch index settings will be applied.
+ `id_expression`: an expression string that generates an ID column to guarantee idempotency when an index refresh job restarts or retries during an index refresh. If an empty string is provided, no ID column will be generated.
+ `extra_options`: a JSON string as extra options that can be passed to Spark streaming source and sink API directly. Use qualified source table name (because there could be multiple) and "sink", e.g. '{"sink": "{key: val}", "table1": {key: val}}'

Note that the index option name is case-sensitive. Here is an example:
@@ -406,6 +407,7 @@ WITH (
watermark_delay = '1 Second',
output_mode = 'complete',
index_settings = '{"number_of_shards": 2, "number_of_replicas": 3}',
id_expression = "sha1(concat_ws('\0',startTime,status))",
extra_options = '{"spark_catalog.default.alb_logs": {"maxFilesPerTrigger": "1"}}'
)
```
8 changes: 7 additions & 1 deletion docs/ppl-lang/PPL-Example-Commands.md
@@ -1,5 +1,10 @@
## Example PPL Queries

#### **AppendCol**
[See additional command details](ppl-appendcol-command.md)
- `source=employees | stats avg(age) as avg_age1 by dept | fields dept, avg_age1 | APPENDCOL [ stats avg(age) as avg_age2 by dept | fields avg_age2 ];` (To display multiple table statistics side by side)
- `source=employees | FIELDS name, dept, age | APPENDCOL OVERRIDE=true [ stats avg(age) as age ];` (When the override option is enabled, fields from the sub-query take precedence over fields in the main query in cases of field name conflicts)

#### **Comment**
[See additional command details](ppl-comment.md)
- `source=accounts | top gender // finds most common gender of all the accounts` (line comment)
@@ -274,7 +279,8 @@ source = table | where ispresent(a) |
- `source=accounts | parse email '.+@(?<host>.+)' | stats count() by host`
- `source=accounts | parse email '.+@(?<host>.+)' | eval eval_result=1 | fields host, eval_result`
- `source=accounts | parse email '.+@(?<host>.+)' | where age > 45 | sort - age | fields age, email, host`
- `source=accounts | parse address '(?<streetNumber>\d+) (?<street>.+)' | where streetNumber > 500 | sort num(streetNumber) | fields streetNumber, street`
- `source=accounts | parse address '(?<streetNumber>\d+) (?<street>.+)' | eval streetNumberInt = cast(streetNumber as integer) | where streetNumberInt > 500 | sort streetNumberInt | fields streetNumber, street`
- Limitation: [see limitations](ppl-parse-command.md#limitations)

#### **view**
[See additional command details](ppl-view-command.md)
2 changes: 2 additions & 0 deletions docs/ppl-lang/README.md
@@ -76,6 +76,8 @@ For additional examples see the next [documentation](PPL-Example-Commands.md).

- [`expand commands`](ppl-expand-command.md)

- [`appendcol command`](ppl-appendcol-command.md)

* **Functions**

- [`Expressions`](functions/ppl-expressions.md)
120 changes: 120 additions & 0 deletions docs/ppl-lang/ppl-appendcol-command.md
@@ -0,0 +1,120 @@
## PPL `appendcol` command

### Description
The `appendcol` command appends the result of a sub-search alongside the input search results (the main search).

### Syntax - APPENDCOL
`APPENDCOL <override=?> [sub-search]...`

* `<override=?>`: optional boolean field that specifies whether columns from the main search result should be overwritten in the case of a column name conflict.
* `sub-search`: executes PPL commands as a secondary search. The sub-search uses the same data specified in the source clause of the main search as its input.


#### Example 1: Append the result of `stats avg(age) as AVG_AGE` to an existing search result

The example appends the result of the sub-search `stats avg(age) as AVG_AGE` alongside the main search.

PPL query:

os> source=employees | FIELDS name, dept, age | APPENDCOL [ stats avg(age) as AVG_AGE ];
fetched rows / total rows = 9/9
+------+-------------+-----+------------------+
| name | dept | age | AVG_AGE |
+------+-------------+-----+------------------+
| Lisa | Sales | 35 | 31.2222222222222 |
| Fred | Engineering | 28 | NULL |
| Paul | Engineering | 23 | NULL |
| Evan | Sales | 38 | NULL |
| Chloe| Engineering | 25 | NULL |
| Tom | Engineering | 33 | NULL |
| Alex | Sales | 33 | NULL |
| Jane | Marketing | 28 | NULL |
| Jeff | Marketing | 38 | NULL |
+------+-------------+-----+------------------+


#### Example 2: Compare multiple stats commands side by side with appendcol

This example demonstrates a common use case: performing multiple statistical calculations and displaying the results side by side in a horizontal layout.

PPL query:

os> source=employees | stats avg(age) as avg_age1 by dept | fields dept, avg_age1 | APPENDCOL [ stats avg(age) as avg_age2 by dept | fields avg_age2 ];
fetched rows / total rows = 3/3
+-------------+-----------+----------+
| dept | avg_age1 | avg_age2 |
+-------------+-----------+----------+
| Engineering | 27.25 | 27.25 |
| Sales | 35.33 | 35.33 |
| Marketing | 33.00 | 33.00 |
+-------------+-----------+----------+


#### Example 3: Append multiple sub-search results

The example demonstrates that multiple APPENDCOL commands can be chained to provide one comprehensive view for the user.

PPL query:

os> source=employees | FIELDS name, dept, age | APPENDCOL [ stats avg(age) as AVG_AGE ] | APPENDCOL [ stats max(age) as MAX_AGE ];
fetched rows / total rows = 9/9
+------+-------------+-----+------------------+---------+
| name | dept | age | AVG_AGE | MAX_AGE |
+------+-------------+-----+------------------+---------+
| Lisa | Sales | 35 | 31.22222222222222| 38 |
| Fred | Engineering | 28 | NULL | NULL |
| Paul | Engineering | 23 | NULL | NULL |
| Evan | Sales | 38 | NULL | NULL |
| Chloe| Engineering | 25 | NULL | NULL |
| Tom | Engineering | 33 | NULL | NULL |
| Alex | Sales | 33 | NULL | NULL |
| Jane | Marketing | 28 | NULL | NULL |
| Jeff | Marketing | 38 | NULL | NULL |
+------+-------------+-----+------------------+---------+

#### Example 4: Override main-search in the case of column name conflict

The example demonstrates the usage of the `OVERRIDE` option to overwrite the `age` column from the main search,
when the option is set to true and a column with the same name `age` is present in the sub-search.

PPL query:

os> source=employees | FIELDS name, dept, age | APPENDCOL OVERRIDE=true [ stats avg(age) as age ];
fetched rows / total rows = 9/9
+------+-------------+------------------+
| name | dept | age |
+------+-------------+------------------+
| Lisa | Sales | 31.22222222222222|
| Fred | Engineering | NULL |
| Paul | Engineering | NULL |
| Evan | Sales | NULL |
| Chloe| Engineering | NULL |
| Tom | Engineering | NULL |
| Alex | Sales | NULL |
| Jane | Marketing | NULL |
| Jeff | Marketing | NULL |
+------+-------------+------------------+

#### Example 5: AppendCol command with duplicated columns

The example demonstrates what happens when conflicting columns exist and `override` is set to false or absent.
In this particular case, an average aggregation over the `age` column, grouped by `dept`, is performed by both the main query and the sub-query.
As a result, `dept` and `avg_age1` are returned by the main query, and `avg_age2` and `dept` by the sub-query.
Because `override` is absent, the duplicated columns are not dropped, so all four columns are displayed in the final result.

PPL query:

os> source=employees | stats avg(age) as avg_age1 by dept | APPENDCOL [ stats avg(age) as avg_age2 by dept ];
fetched rows / total rows = 3/3
+------------+--------------+------------+--------------+
| avg_age1 | dept | avg_age2 | dept |
+------------+--------------+------------+--------------+
| 35.33 | Sales | 35.33 | Sales |
| 27.25 | Engineering | 27.25 | Engineering |
| 33.00 | Marketing | 33.00 | Marketing |
+------------+--------------+------------+--------------+


### Limitation
When `override` is set to true, only `FIELDS` and `STATS` commands are allowed as the final clause in a sub-search.
Otherwise, an IllegalStateException with the message `Not Supported operation: APPENDCOL should specify the output fields` will be thrown.
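For illustration, a sub-search whose final command is neither `FIELDS` nor `STATS` would hit this limitation. The hypothetical query below ends its sub-search with `where` and would therefore be rejected with the exception above:

- `source=employees | FIELDS name, dept, age | APPENDCOL OVERRIDE=true [ where age > 30 ]`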
2 changes: 1 addition & 1 deletion docs/ppl-lang/ppl-parse-command.md
@@ -58,7 +58,7 @@ The example shows how to sort street numbers that are higher than 500 in ``address`` field.

PPL query:

-os> source=accounts | parse address '(?<streetNumber>\d+) (?<street>.+)' | where cast(streetNumber as int) > 500 | sort num(streetNumber) | fields streetNumber, street ;
+os> source=accounts | parse address '(?<streetNumber>\d+) (?<street>.+)' | eval streetNumberInt = cast(streetNumber as integer) | where streetNumberInt > 500 | sort streetNumberInt | fields streetNumber, street ;
fetched rows / total rows = 3/3
+----------------+----------------+
| streetNumber | street |
@@ -11,8 +11,10 @@ import org.opensearch.flint.common.metadata.FlintMetadata
import org.opensearch.flint.common.metadata.log.FlintMetadataLogEntry
import org.opensearch.flint.core.metadata.FlintJsonHelper._

import org.apache.spark.internal.Logging
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.flint.datatype.FlintDataType
import org.apache.spark.sql.functions.expr
import org.apache.spark.sql.types.StructType

/**
@@ -62,7 +64,7 @@ trait FlintSparkIndex {
def build(spark: SparkSession, df: Option[DataFrame]): DataFrame
}

object FlintSparkIndex {
object FlintSparkIndex extends Logging {

/**
* Interface indicates a Flint index has custom streaming refresh capability other than foreach
@@ -117,6 +119,25 @@ object FlintSparkIndex {
s"${parts(0)}.${parts(1)}.`${parts.drop(2).mkString(".")}`"
}

/**
* Generate an ID column using ID expression provided in the index option.
*
* @param df
* which DataFrame to generate ID column
* @param options
* Flint index options
* @return
* DataFrame with/without ID column
*/
def addIdColumn(df: DataFrame, options: FlintSparkIndexOptions): DataFrame = {
options.idExpression() match {
case Some(idExpr) if idExpr.nonEmpty =>
logInfo(s"Using user-provided ID expression: $idExpr")
df.withColumn(ID_COLUMN, expr(idExpr))
case _ => df
}
}

/**
* Populate environment variables to persist in Flint metadata.
*
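To make the new helper's behavior concrete, here is a minimal sketch (an editorial illustration, not part of this commit; the sample data, local master, and ID expression are assumed) showing that `addIdColumn` appends the ID column only when the `id_expression` option is set:

```scala
import org.apache.spark.sql.SparkSession
import org.opensearch.flint.spark.FlintSparkIndexOptions
import org.opensearch.flint.spark.FlintSparkIndex.{addIdColumn, ID_COLUMN}

val spark = SparkSession.builder().master("local[1]").getOrCreate()
import spark.implicits._

// Two sample rows standing in for source table data (illustrative only)
val df = Seq(("2023-01-01T00:00:00Z", "200"), ("2023-01-01T00:01:00Z", "404"))
  .toDF("startTime", "status")

// With id_expression set, a deterministic ID column is appended via expr(...)
val withId = addIdColumn(
  df,
  FlintSparkIndexOptions(Map("id_expression" -> "sha1(concat_ws('-', startTime, status))")))
assert(withId.columns.contains(ID_COLUMN))

// Without the option, the DataFrame is returned unchanged
val unchanged = addIdColumn(df, FlintSparkIndexOptions(Map.empty))
assert(unchanged.columns.sameElements(df.columns))
```

The covering index and materialized view builders changed below apply this same call to their batch and streaming DataFrames.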
@@ -10,7 +10,7 @@ import java.util.{Collections, UUID}
import org.json4s.{Formats, NoTypeHints}
import org.json4s.native.JsonMethods._
import org.json4s.native.Serialization
-import org.opensearch.flint.spark.FlintSparkIndexOptions.OptionName.{AUTO_REFRESH, CHECKPOINT_LOCATION, EXTRA_OPTIONS, INCREMENTAL_REFRESH, INDEX_SETTINGS, OptionName, OUTPUT_MODE, REFRESH_INTERVAL, SCHEDULER_MODE, WATERMARK_DELAY}
+import org.opensearch.flint.spark.FlintSparkIndexOptions.OptionName.{AUTO_REFRESH, CHECKPOINT_LOCATION, EXTRA_OPTIONS, ID_EXPRESSION, INCREMENTAL_REFRESH, INDEX_SETTINGS, OptionName, OUTPUT_MODE, REFRESH_INTERVAL, SCHEDULER_MODE, WATERMARK_DELAY}
import org.opensearch.flint.spark.FlintSparkIndexOptions.validateOptionNames
import org.opensearch.flint.spark.refresh.FlintSparkIndexRefresh.SchedulerMode
import org.opensearch.flint.spark.scheduler.util.IntervalSchedulerParser
@@ -96,6 +96,14 @@ case class FlintSparkIndexOptions(options: Map[String, String]) {
*/
def indexSettings(): Option[String] = getOptionValue(INDEX_SETTINGS)

/**
* An expression that generates unique value as index data row ID.
*
* @return
* ID expression
*/
def idExpression(): Option[String] = getOptionValue(ID_EXPRESSION)

/**
* Extra streaming source options that can be simply passed to DataStreamReader or
* Relation.options
@@ -187,6 +195,7 @@ object FlintSparkIndexOptions {
val WATERMARK_DELAY: OptionName.Value = Value("watermark_delay")
val OUTPUT_MODE: OptionName.Value = Value("output_mode")
val INDEX_SETTINGS: OptionName.Value = Value("index_settings")
val ID_EXPRESSION: OptionName.Value = Value("id_expression")
val EXTRA_OPTIONS: OptionName.Value = Value("extra_options")
}

@@ -10,7 +10,7 @@ import scala.collection.JavaConverters.mapAsJavaMapConverter
import org.opensearch.flint.common.metadata.FlintMetadata
import org.opensearch.flint.common.metadata.log.FlintMetadataLogEntry
import org.opensearch.flint.spark._
-import org.opensearch.flint.spark.FlintSparkIndex.{flintIndexNamePrefix, generateSchema, metadataBuilder, quotedTableName}
+import org.opensearch.flint.spark.FlintSparkIndex.{addIdColumn, flintIndexNamePrefix, generateSchema, metadataBuilder, quotedTableName}
import org.opensearch.flint.spark.FlintSparkIndexOptions.empty
import org.opensearch.flint.spark.covering.FlintSparkCoveringIndex.{getFlintIndexName, COVERING_INDEX_TYPE}

@@ -71,10 +71,13 @@ case class FlintSparkCoveringIndex(
val job = df.getOrElse(spark.read.table(quotedTableName(tableName)))

// Add optional filtering condition
-filterCondition
-.map(job.where)
-.getOrElse(job)
-.select(colNames.head, colNames.tail: _*)
+val batchDf =
+filterCondition
+.map(job.where)
+.getOrElse(job)
+.select(colNames.head, colNames.tail: _*)
+
+addIdColumn(batchDf, options)
}
}

@@ -13,7 +13,7 @@ import scala.collection.convert.ImplicitConversions.`map AsScala`
import org.opensearch.flint.common.metadata.FlintMetadata
import org.opensearch.flint.common.metadata.log.FlintMetadataLogEntry
import org.opensearch.flint.spark.{FlintSpark, FlintSparkIndex, FlintSparkIndexBuilder, FlintSparkIndexOptions}
-import org.opensearch.flint.spark.FlintSparkIndex.{flintIndexNamePrefix, generateSchema, metadataBuilder, StreamingRefresh}
+import org.opensearch.flint.spark.FlintSparkIndex.{addIdColumn, flintIndexNamePrefix, generateSchema, metadataBuilder, ID_COLUMN, StreamingRefresh}
import org.opensearch.flint.spark.FlintSparkIndexOptions.empty
import org.opensearch.flint.spark.function.TumbleFunction
import org.opensearch.flint.spark.mv.FlintSparkMaterializedView.{getFlintIndexName, MV_INDEX_TYPE}
@@ -81,7 +81,8 @@ case class FlintSparkMaterializedView(
override def build(spark: SparkSession, df: Option[DataFrame]): DataFrame = {
require(df.isEmpty, "materialized view doesn't support reading from other data frame")

-spark.sql(query)
+val batchDf = spark.sql(query)
+addIdColumn(batchDf, options)
}

override def buildStream(spark: SparkSession): DataFrame = {
@@ -99,7 +100,9 @@ case class FlintSparkMaterializedView(
case relation: UnresolvedRelation if !relation.isStreaming =>
relation.copy(isStreaming = true, options = optionsWithExtra(spark, relation))
}
-logicalPlanToDataFrame(spark, streamingPlan)
+
+val streamingDf = logicalPlanToDataFrame(spark, streamingPlan)
+addIdColumn(streamingDf, options)
}

private def watermark(timeCol: Attribute, child: LogicalPlan) = {
@@ -6,9 +6,12 @@
package org.apache.spark

import org.opensearch.flint.spark.FlintSparkExtensions
import org.opensearch.flint.spark.FlintSparkIndex.ID_COLUMN

import org.apache.spark.sql.catalyst.expressions.CodegenObjectFactoryMode
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.catalyst.expressions.{Alias, CodegenObjectFactoryMode, Expression}
import org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation
import org.apache.spark.sql.catalyst.plans.logical.Project
import org.apache.spark.sql.flint.config.{FlintConfigEntry, FlintSparkConf}
import org.apache.spark.sql.flint.config.FlintSparkConf.{EXTERNAL_SCHEDULER_ENABLED, HYBRID_SCAN_ENABLED, METADATA_CACHE_WRITE}
import org.apache.spark.sql.internal.SQLConf
@@ -68,4 +71,27 @@ trait FlintSuite extends SharedSparkSession {
setFlintSparkConf(METADATA_CACHE_WRITE, "false")
}
}

/**
* Implicit class to extend DataFrame functionality with additional utilities.
*
* @param df
* the DataFrame to which the additional methods are added
*/
protected implicit class DataFrameExtensions(val df: DataFrame) {

/**
* Retrieves the ID column expression from the logical plan of the DataFrame, if it exists.
*
* @return
* an `Option` containing the `Expression` for the ID column if present, or `None` otherwise
*/
def idColumn(): Option[Expression] = {
df.queryExecution.logical.collectFirst { case Project(projectList, _) =>
projectList.collectFirst { case Alias(child, ID_COLUMN) =>
child
}
}.flatten
}
}
}