diff --git a/docs/ppl-lang/PPL-Example-Commands.md b/docs/ppl-lang/PPL-Example-Commands.md
index 26ddd3613..3d917ae35 100644
--- a/docs/ppl-lang/PPL-Example-Commands.md
+++ b/docs/ppl-lang/PPL-Example-Commands.md
@@ -118,6 +118,7 @@ Assumptions: `a`, `b`, `c` are existing fields in `table`
 - `source = table | eval r = coalesce(a, b, c) | fields r`
 - `source = table | eval e = isempty(a) | fields e`
 - `source = table | eval e = isblank(a) | fields e`
+- `source = table | eval e = cast(a as timestamp) | fields e`
 - `source = table | eval f = case(a = 0, 'zero', a = 1, 'one', a = 2, 'two', a = 3, 'three', a = 4, 'four', a = 5, 'five', a = 6, 'six', a = 7, 'se7en', a = 8, 'eight', a = 9, 'nine')`
 - `source = table | eval f = case(a = 0, 'zero', a = 1, 'one' else 'unknown')`
 - `source = table | eval f = case(a = 0, 'zero', a = 1, 'one' else concat(a, ' is an incorrect binary digit'))`
@@ -486,4 +487,11 @@ _- **Limitation: another command usage of (relation) subquery is in `appendcols`
 
 > ppl-correlation-command is an experimental command - it may be removed in future versions
 
+#### **Cast**
+[See additional command details](functions/ppl-conversion.md)
+- `source = table | eval int_to_string = cast(1 as string) | fields int_to_string`
+- `source = table | eval int_to_string = cast(int_col as string) | fields int_col, int_to_string`
+- `source = table | eval cdate = CAST('2012-08-07' as date), ctime = cast('2012-08-07T08:07:06' as timestamp) | fields cdate, ctime`
+- `source = table | eval chained_cast = cast(cast("true" as boolean) as int) | fields chained_cast`
+
 ---
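The examples added above resolve to Spark's `CAST` semantics once `DataTypeTransformer.translate` (changed later in this diff) has mapped the PPL type, so they can be sanity-checked against Spark directly. A minimal self-contained sketch; the object name and the local session are illustrative, not part of this change:

```scala
// Sketch: replay the newly documented casts as plain Spark SQL.
import org.apache.spark.sql.SparkSession

object CastDocExamplesCheck extends App {
  val spark = SparkSession.builder().master("local[1]").appName("cast-doc-check").getOrCreate()

  // Chained cast from the new example: "true" -> boolean true -> int 1.
  spark.sql("SELECT CAST(CAST('true' AS BOOLEAN) AS INT) AS chained_cast").show()

  // String -> timestamp, including the ISO-8601 'T' separator used in the new example.
  spark.sql("SELECT CAST('2012-08-07T08:07:06' AS TIMESTAMP) AS ctime").show()

  spark.stop()
}
```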
diff --git a/docs/ppl-lang/functions/ppl-conversion.md b/docs/ppl-lang/functions/ppl-conversion.md
index b1f35eb1b..6dc5e8688 100644
--- a/docs/ppl-lang/functions/ppl-conversion.md
+++ b/docs/ppl-lang/functions/ppl-conversion.md
@@ -7,23 +7,26 @@ `cast(expr as dateType)` cast the expr to dataType. return the value of dataType.
 The following conversion rules are used:
 
 ```
-+------------+--------+--------+---------+-------------+--------+--------+
-| Src/Target | STRING | NUMBER | BOOLEAN | TIMESTAMP   | DATE   | TIME   |
-+------------+--------+--------+---------+-------------+--------+--------+
-| STRING     |        | Note1  | Note1   | TIMESTAMP() | DATE() | TIME() |
-+------------+--------+--------+---------+-------------+--------+--------+
-| NUMBER     | Note1  |        | v!=0    | N/A         | N/A    | N/A    |
-+------------+--------+--------+---------+-------------+--------+--------+
-| BOOLEAN    | Note1  | v?1:0  |         | N/A         | N/A    | N/A    |
-+------------+--------+--------+---------+-------------+--------+--------+
-| TIMESTAMP  | Note1  | N/A    | N/A     |             | DATE() | N/A    |
-+------------+--------+--------+---------+-------------+--------+--------+
-| DATE       | Note1  | N/A    | N/A     | N/A         |        | N/A    |
-+------------+--------+--------+---------+-------------+--------+--------+
-| TIME       | Note1  | N/A    | N/A     | N/A         | N/A    |        |
-+------------+--------+--------+---------+-------------+--------+--------+
++------------+--------+--------+---------+-------------+--------+
+| Src/Target | STRING | NUMBER | BOOLEAN | TIMESTAMP   | DATE   |
++------------+--------+--------+---------+-------------+--------+
+| STRING     |        | Note1  | Note1   | TIMESTAMP() | DATE() |
++------------+--------+--------+---------+-------------+--------+
+| NUMBER     | Note1  |        | v!=0    | N/A         | N/A    |
++------------+--------+--------+---------+-------------+--------+
+| BOOLEAN    | Note1  | v?1:0  |         | N/A         | N/A    |
++------------+--------+--------+---------+-------------+--------+
+| TIMESTAMP  | Note1  | N/A    | N/A     |             | DATE() |
++------------+--------+--------+---------+-------------+--------+
+| DATE       | Note1  | N/A    | N/A     | N/A         |        |
++------------+--------+--------+---------+-------------+--------+
+| TIME       | Note1  | N/A    | N/A     | N/A         | N/A    |
++------------+--------+--------+---------+-------------+--------+
 ```
 
-Note: Spark does not support the `TIME` type. Using the `CAST` function will convert it to **STRING**.
+- `NUMBER` includes `INTEGER` (alias `INT`), `LONG`, `FLOAT`, `DOUBLE`.
+
+- `BOOLEAN` has the alias `BOOL`.
+
 
 Cast to **string** example:
 
@@ -47,13 +50,13 @@ Cast to **number** example:
 
 Cast to **date** example:
 
-    os> source=people | eval `cdate` = CAST('2012-08-07' as date), `ctime` = CAST('01:01:01' as time), `ctimestamp` = CAST('2012-08-07 01:01:01' as timestamp) | fields `cdate`, `ctime`, `ctimestamp`
+    os> source=people | eval `cdate` = CAST('2012-08-07' as date), `ctimestamp` = CAST('2012-08-07 01:01:01' as timestamp) | fields `cdate`, `ctimestamp`
     fetched rows / total rows = 1/1
-    +------------+----------+---------------------+
-    | cdate      | ctime    | ctimestamp          |
-    |------------+----------+---------------------|
-    | 2012-08-07 | 01:01:01 | 2012-08-07 01:01:01 |
-    +------------+----------+---------------------+
+    +------------+---------------------+
+    | cdate      | ctimestamp          |
+    |------------+---------------------|
+    | 2012-08-07 | 2012-08-07 01:01:01 |
+    +------------+---------------------+
 
 Cast function can be **chained**:
 
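The updated table mirrors Spark's own `CAST` behavior, so the `Note1`, `v!=0`, and `v?1:0` cells can be checked directly. A spark-shell sketch, assuming the predefined `spark` session; note that `TIME` survives only as a source row, since casting *to* `time` now fails loudly (see the `DataTypeTransformer` change below):

```scala
// Sketch: the table's conversion rules, expressed as plain Spark SQL.
spark.sql("SELECT CAST('42' AS INT) AS s2n").show()     // Note1: string -> number
spark.sql("SELECT CAST(3.14 AS STRING) AS n2s").show()  // Note1: number -> string
spark.sql("SELECT CAST(0 AS BOOLEAN) AS zero, CAST(42 AS BOOLEAN) AS nonzero").show() // v!=0
spark.sql("SELECT CAST(true AS INT) AS t, CAST(false AS INT) AS f").show()            // v?1:0
```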
diff --git a/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLCastITSuite.scala b/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLCastITSuite.scala
index aa37d6818..9aa26bbab 100644
--- a/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLCastITSuite.scala
+++ b/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLCastITSuite.scala
@@ -63,20 +63,18 @@ class FlintSparkPPLCastITSuite
       | cast_true = cast("True" as boolean),
       | cast_false = cast("false" as boolean),
       | cast_timestamp = cast("2024-11-26 23:39:06" as timestamp),
-      | cast_date = cast("2024-11-26" as date),
-      | cast_time = cast("12:34:56" as time)
-      | | fields id_int, cast_true, cast_false, cast_timestamp, cast_date, cast_time | head 1
+      | cast_date = cast("2024-11-26" as date)
+      | | fields id_int, cast_true, cast_false, cast_timestamp, cast_date | head 1
       | """.stripMargin)
-    // Note: Spark doesn't support data type of `Time`, cast it to StringTypes by default.
     assert(
-      frame.dtypes.sameElements(Array(
-        ("id_int", "IntegerType"),
-        ("cast_true", "BooleanType"),
-        ("cast_false", "BooleanType"),
-        ("cast_timestamp", "TimestampType"),
-        ("cast_date", "DateType"),
-        ("cast_time", "StringType"))))
+      frame.dtypes.sameElements(
+        Array(
+          ("id_int", "IntegerType"),
+          ("cast_true", "BooleanType"),
+          ("cast_false", "BooleanType"),
+          ("cast_timestamp", "TimestampType"),
+          ("cast_date", "DateType"))))
 
     assertSameRows(
       Seq(
        Row(
@@ -84,8 +82,7 @@ class FlintSparkPPLCastITSuite
          1,
          true,
          false,
          Timestamp.valueOf("2024-11-26 23:39:06"),
-          Date.valueOf("2024-11-26"),
-          "12:34:56")),
+          Date.valueOf("2024-11-26"))),
      frame)
  }
diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/DataType.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/DataType.java
index fb3cbb598..802b72f78 100644
--- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/DataType.java
+++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/DataType.java
@@ -37,6 +37,8 @@ public static DataType fromString(String name) {
     switch (upperName) {
       case "INT":
         return INTEGER;
+      case "BOOL":
+        return BOOLEAN;
       default:
         return valueOf(upperName);
     }
diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/DataTypeTransformer.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/DataTypeTransformer.java
index 5d45b2357..f583d7847 100644
--- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/DataTypeTransformer.java
+++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/DataTypeTransformer.java
@@ -54,6 +54,8 @@ static DataType translate(org.opensearch.sql.ast.expression.DataType source) {
       return DateType$.MODULE$;
     case TIMESTAMP:
       return DataTypes.TimestampType;
+    case STRING:
+      return DataTypes.StringType;
     case INTEGER:
       return IntegerType$.MODULE$;
     case LONG:
@@ -71,7 +73,7 @@ static DataType translate(org.opensearch.sql.ast.expression.DataType source) {
     case UNDEFINED:
       return NullType$.MODULE$;
     default:
-      return StringType$.MODULE$;
+      throw new IllegalArgumentException("Unsupported data type for Spark: " + source);
     }
   }
diff --git a/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanCastTestSuite.scala b/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanCastTestSuite.scala
index 569ea4484..809800aa5 100644
--- a/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanCastTestSuite.scala
+++ b/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanCastTestSuite.scala
@@ -108,6 +108,7 @@ class PPLLogicalPlanCastTestSuite
   }
 
   test("test cast with unsupported dataType") {
+    // Unsupported data type for the OpenSearch parser
     val context = new CatalystPlanContext
     val exception = intercept[SyntaxCheckException] {
       planTransformer.visit(
@@ -117,6 +118,13 @@ class PPLLogicalPlanCastTestSuite
     assert(
       exception.getMessage.contains(
         "Failed to parse query due to offending symbol [UNSUPPORTED_DATATYPE]"))
+
+    // Unsupported data type for Spark
+    val context2 = new CatalystPlanContext
+    val exception2 = intercept[IllegalArgumentException] {
+      planTransformer.visit(plan(pplParser, """source=t | eval a = cast(a as time)"""), context2)
+    }
+    assert(exception2.getMessage == "Unsupported data type for Spark: TIME")
   }
 }
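Together, the `BOOL` alias and the stricter `translate()` default give `cast` an explicit mapping for every supported type. A test-style sketch of the new paths; `DataType.fromString` and `DataTypeTransformer.translate` are the methods touched by this diff, while the harness around them is illustrative:

```scala
// Sketch: exercise the new "BOOL" alias and the stricter translate() fallback.
import org.apache.spark.sql.types.DataTypes
import org.opensearch.sql.ast.expression.DataType
import org.opensearch.sql.ppl.utils.DataTypeTransformer

object CastPathsCheck extends App {
  // New alias: "BOOL" resolves to BOOLEAN instead of falling through to
  // valueOf("BOOL"), which would throw IllegalArgumentException.
  assert(DataType.fromString("BOOL") == DataType.BOOLEAN)
  // "INT" keeps working through the pre-existing alias case.
  assert(DataType.fromString("INT") == DataType.INTEGER)

  // STRING now has an explicit mapping instead of riding on the old default branch.
  assert(DataTypeTransformer.translate(DataType.STRING) == DataTypes.StringType)

  // Unsupported types now fail loudly rather than silently degrading to string.
  val thrown =
    try { DataTypeTransformer.translate(DataType.TIME); None }
    catch { case e: IllegalArgumentException => Some(e) }
  assert(thrown.exists(_.getMessage == "Unsupported data type for Spark: TIME"))
}
```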