diff --git a/src/main/java/com/snowflake/snowpark_java/Functions.java b/src/main/java/com/snowflake/snowpark_java/Functions.java index ce790653..b04a1b27 100644 --- a/src/main/java/com/snowflake/snowpark_java/Functions.java +++ b/src/main/java/com/snowflake/snowpark_java/Functions.java @@ -4641,6 +4641,212 @@ public static Column format_number(Column x, Integer d) { return new Column(com.snowflake.snowpark.functions.format_number(x.toScalaColumn(), d)); } + /** + * This leverages JSON_EXTRACT_PATH_TEXT and improves functionality by allowing multiple columns + * in a single call, whereas JSON_EXTRACT_PATH_TEXT must be called once for every column. + * + *
NOTE: + * + *
Usage: + *
{@code + * { + * df = session.createDataFrame(Seq(("CR", "{\"id\": 5, + * \"name\": \"Jose\", \"age\": 29}"))) + * .toDF(Seq("nationality", "json_string")) + * } + * When the result of this function is the only part of + * the select statement, no changes are needed + * df.select(json_tuple(col("json_string"), "id", "name", "age")).show() + * ---------------------- + * |"C0" |"C1" |"C2" | + * ---------------------- + * |5 |Jose |29 | + * ---------------------- + * + * However, when specifying multiple columns, an expression like this is required: + * + * df.select( + * col("nationality") + * , json_tuple(col("json_string"), "id", "name", "age"):_* // Notice the :_* syntax. + * ).show() + * + * + * + * ------------------------------------------------- + * |"NATIONALITY" |"C0" |"C1" |"C2" |"C3" | + * ------------------------------------------------- + * |CR |5 |Jose |29 |Mobilize | + * ------------------------------------------------- + * }+ * + * @since 1.15.0 + * @param json Column containing the JSON string text. + * @param fields Fields to pull from the JSON string. + * @return List of Column objects containing the extracted fields. + */ + public static List
Example + * + *
{@code + * SELECT x, cbrt(x) FROM tab; + * + * --------+-------------+ + * x | cbrt(x) | + * --------+-------------+ + * 0 | 0 | + * 2 | 1.25992105 | + * -10 | -2.15443469 | + * [NULL] | [NULL] | + * --------+-------------+ + * }+ * + * @since 1.15.0 + * @param x Column to calculate the cubic root. + * @return Column object. + */ + public static Column cbrt(Column x) { + return new Column(com.snowflake.snowpark.functions.cbrt(x.toScalaColumn())); + } + + /** + * Used to calculate the cubic root of a number. + * + *
Example + * + *
{@code + * SELECT x, cbrt(x) FROM tab; + * + * --------+-------------+ + * x | cbrt(x) | + * --------+-------------+ + * 0 | 0 | + * 2 | 1.25992105 | + * -10 | -2.15443469 | + * [NULL] | [NULL] | + * --------+-------------+ + * }+ * + * @since 1.15.0 + * @param columnName Name of the column, as a string, to calculate the cubic root. + * @return Column object. + */ + public static Column cbrt(String columnName) { + return new Column(functions.cbrt(columnName)); + } + + /** + * This function converts a JSON string to a variant in Snowflake. + * + *
In Snowflake the values are converted automatically, however they're converted as variants, + * meaning that the printSchema function would return different datatypes. To convert the datatype + * so that it is printed as the expected datatype, it should be read on the + *
Example + * + *
{@code + * selectExpr function as "json['relative']['age']::integer" + * val data_for_json = Seq( + * (1, "{\"id\": 172319, \"age\": 41, \"relative\": {\"id\": 885471, \"age\": 29}}") + * (2, "{\"id\": 532161, \"age\": 17, \"relative\":{\"id\": 873513, \"age\": 47}}") + * ) + * val data_for_json_column = Seq("col1", "col2") + * val df_for_json = session.createDataFrame(data_for_json).toDF(data_for_json_column) + * + * val json_df = df_for_json.select( + * from_json(col("col2")).as("json") + * ) + * + * json_df.selectExpr( + * "json['id']::integer as id" + * , "json['age']::integer as age" + * , "json['relative']['id']::integer as rel_id" + * , "json['relative']['age']::integer as rel_age" + * ).show(10, 10000) + * ----------------------------------------- + * |"ID" |"AGE" |"REL_ID" |"REL_AGE" | + * ----------------------------------------- + * |172319 |41 |885471 |29 | + * |532161 |17 |873513 |47 | + * ----------------------------------------- + * }+ * + * @since 1.15.0 + * @param e String column to convert to variant. + * @return Column object. + */ + public static Column from_json(Column e) { + return new Column(functions.from_json(e.toScalaColumn())); + } + /** + * Returns the value of sourceExpr cast to data type targetType if possible, or NULL if not + * possible. + * + *
Example:: + * + *
{@code + * df = session.create_dataframe(['0.12', 'USD 27.90', + * '13.97 USD', '97.0', '17,-'], schema=["a"]) + * df.select(try_cast(col("a"), FloatType()).as_('ans')).collect() + * [Row(ANS=0.12), Row(ANS=None), Row(ANS=None), Row(ANS=None), Row(ANS=None)] + * }+ * + * @since 1.15.0 + * @param e Any castable expression + * @param targetType The type of the result + * @return The result is of type targetType. special version of CAST for a subset of datatype + * conversions. It performs the same operation (i.e. converts a value of one data type into + * another data type), but returns a NULL value instead of raising an error when the + * conversion can not be performed. The column argument must be a string column in Snowflake. + */ + public static Column try_cast(Column e, DataType targetType) { + return e.cast(targetType); + } + /** + * This function receives a date or timestamp, as well as a properly formatted string and + * subtracts the specified amount of days from it. If receiving a string, this string is casted to + * date using try_cast and if it's not possible to cast, returns null. If receiving a timestamp it + * will be casted to date (removing its time). Example:: + * + *
>>> from snowflake.snowpark.functions import date_sub, to_date >>> df = + * session.createDataFrame([("1976-01-06")], ["date"]) >>> df = df.withColumn("date", + * to_date("date")) >>> df.withColumn("date", date_sub("date", 2)).show() -------------- |"DATE" | + * -------------- |1976-01-04 | -------------- + * + * @since 1.15.0 + * @param start Date, Timestamp or String column to subtract days from. + * @param days Days to subtract. + * @return Column object. + */ + public static Column date_sub(Column start, Integer days) { + return new Column(functions.date_sub(start.toScalaColumn(), days)); + } /* Returns a Column expression with values sorted in descending * *
Example: order column values in descending diff --git a/src/main/scala/com/snowflake/snowpark/functions.scala b/src/main/scala/com/snowflake/snowpark/functions.scala index 264b9ffe..81db9a50 100644 --- a/src/main/scala/com/snowflake/snowpark/functions.scala +++ b/src/main/scala/com/snowflake/snowpark/functions.scala @@ -3887,6 +3887,256 @@ object functions { builtin("TO_VARCHAR")(x, if (d > 0) s"999,999.${"0" * d}" else "999,999") } } + + /** + * This leverages JSON_EXTRACT_PATH_TEXT and improves functionality by allowing multiple columns + * in a single call, whereas JSON_EXTRACT_PATH_TEXT must be called once for every column. + * + * NOTE: + *
+ * ---------------------- + * |"C0" |"C1" |"C2" | + * ---------------------- + * |5 |Jose |29 | + * ---------------------- + *+ * However, when specifying multiple columns, an expression like this is required: + *
+ * df.select( + * col("nationality") + * , json_tuple(col("json_string"), "id", "name", "age"):_* // Notice the :_* syntax. + * ).show() + *+ * + *
+ * ------------------------------------------------- + * |"NATIONALITY" |"C0" |"C1" |"C2" |"C3" | + * ------------------------------------------------- + * |CR |5 |Jose |29 |Mobilize | + * ------------------------------------------------- + *+ * @since 1.15.0 + * @param json Column containing the JSON string text. + * @param fields Fields to pull from the JSON file. + * @return Column sequence with the specified strings. + */ + def json_tuple(json: Column, fields: Column*): Seq[Column] = { + var i = -1 + fields.map(f => { + i += 1 + builtin("JSON_EXTRACT_PATH_TEXT")(json, f).as(s"c$i") + }) + } + + /** + * This leverages JSON_EXTRACT_PATH_TEXT and improves functionality by allowing multiple columns + * in a single call, whereas JSON_EXTRACT_PATH_TEXT must be called once for every column. + * + * NOTE: + *
+ * ---------------------- + * |"C0" |"C1" |"C2" | + * ---------------------- + * |5 |Jose |29 | + * ---------------------- + *+ * However, when specifying multiple columns, an expression like this is required: + *
+ * df.select( + * col("nationality") + * , json_tuple(col("json_string"), "id", "name", "age"):_* // Notice the :_* syntax. + * ).show() + *+ * + *
+ * ------------------------------------------------- + * |"NATIONALITY" |"C0" |"C1" |"C2" |"C3" | + * ------------------------------------------------- + * |CR |5 |Jose |29 |Mobilize | + * ------------------------------------------------- + *+ * @since 1.15.0 + * @param json Column containing the JSON string text. + * @param fields Fields to pull from the JSON file. + * @return Column sequence with the specified strings. + */ + def json_tuple(json: String, fields: String*): Seq[Column] = { + var i = -1 + fields.map(f => { + i += 1 + builtin("JSON_EXTRACT_PATH_TEXT")(Column(json), Column(f)).as(s"c$i") + }) + } + + /** + * Used to calculate the cubic root of a number. + * Example + * SELECT x, cbrt(x) FROM tab; + * + * --------+-------------+ + * x | cbrt(x) | + * --------+-------------+ + * 0 | 0 | + * 2 | 1.25992105 | + * -10 | -2.15443469 | + * [NULL] | [NULL] | + * --------+-------------+ + * + * @since 1.15.0 + * @param column Column to calculate the cubic root. + * @return Column object. + */ + def cbrt(e: Column): Column = { + builtin("CBRT")(e) + } + + /** + * Used to calculate the cubic root of a number. There were slight differences found: + * Example + * SELECT x, cbrt(x) FROM tab; + * + * --------+-------------+ + * x | cbrt(x) | + * --------+-------------+ + * 0 | 0 | + * 2 | 1.25992105 | + * -10 | -2.15443469 | + * [NULL] | [NULL] | + * --------+-------------+ + * + * @since 1.15.0 + * @param column Column to calculate the cubic root. + * @return Column object. + */ + def cbrt(columnName: String): Column = { + cbrt(col(columnName)) + } + + /** + * This function converts a JSON string to a variant in Snowflake. + * + * In Snowflake the values are converted automatically, however they're converted as variants, + * meaning that the printSchema + * function would return different datatypes. 
+ * To convert the datatype and it to be printed as the expected datatype, + * it should be read on the + * selectExpr function as "json['relative']['age']::integer" + * val data_for_json = Seq( + * (1, "{\"id\": 172319, \"age\": 41, \"relative\": {\"id\": 885471, \"age\": 29}}") + * (2, "{\"id\": 532161, \"age\": 17, \"relative\":{\"id\": 873513, \"age\": 47}}") + * ) + * val data_for_json_column = Seq("col1", "col2") + * val df_for_json = session.createDataFrame(data_for_json).toDF(data_for_json_column) + * + * val json_df = df_for_json.select( + * from_json(col("col2")).as("json") + * ) + * + * json_df.selectExpr( + * "json['id']::integer as id" + * , "json['age']::integer as age" + * , "json['relative']['id']::integer as rel_id" + * , "json['relative']['age']::integer as rel_age" + * ).show(10, 10000) + * + * + *
+ * ----------------------------------------- + * |"ID" |"AGE" |"REL_ID" |"REL_AGE" | + * ----------------------------------------- + * |172319 |41 |885471 |29 | + * |532161 |17 |873513 |47 | + * ----------------------------------------- + *+ * @since 1.15.0 + * @param e String column to convert to variant. + * @return Column object. + */ + def from_json(e: Column): Column = { + builtin("TRY_PARSE_JSON")(e) + } + + /** + * Returns the value of sourceExpr cast to data type + * targetType if possible, or NULL if not possible. + * Example:: + * + * df = session.create_dataframe(['0.12', 'USD 27.90', + * '13.97 USD', '97.0', '17,-'], schema=["a"]) + * df.select(try_cast(col("a"), FloatType()).as_('ans')).collect() + * [Row(ANS=0.12), Row(ANS=None), Row(ANS=None), Row(ANS=None), Row(ANS=None)] + * @since 1.15.0 + * @param source Any castable expression + * @param Target The type of the result + * @return The result is of type targetType. + * special version of CAST for a subset of datatype conversions. + * It performs the same operation + * (i.e. converts a value of one data type into another data type), + * but returns a NULL value instead of raising an error + * when the conversion can not be performed. + * The column argument must be a string column in Snowflake. + */ + def try_cast(e: Column, targetType: DataType): Column = { + e.cast(targetType) + } + + /** + * This function receives a date or timestamp, as well as a + * properly formatted string and subtracts the specified + * amount of days from it. If receiving a string, this string is + * casted to date using try_cast and if it's not possible to cast, + * returns null. If receiving + * a timestamp it will be casted to date (removing its time). 
+ * Example:: + * + * >>> from snowflake.snowpark.functions import date_sub, to_date + * >>> df = session.createDataFrame([("1976-01-06")], ["date"]) + * >>> df = df.withColumn("date", to_date("date")) + * >>> df.withColumn("date", date_sub("date", 2)).show() + * -------------- + * |"DATE" | + * -------------- + * |1976-01-04 | + * -------------- + * """ + * + * @since 1.15.0 + * @param start Date, Timestamp or String column to subtract days from. + * @param days Days to subtract. + * @return Column object. + */ + def date_sub(start: Column, days: Int): Column = { + dateadd("DAY", lit(days * -1), try_cast(start, DateType)) + } + /* Returns a Column expression with values sorted in descending order. * Example: * {{{ diff --git a/src/test/java/com/snowflake/snowpark_test/JavaFunctionSuite.java b/src/test/java/com/snowflake/snowpark_test/JavaFunctionSuite.java index 89e9cbb5..5de58b3d 100644 --- a/src/test/java/com/snowflake/snowpark_test/JavaFunctionSuite.java +++ b/src/test/java/com/snowflake/snowpark_test/JavaFunctionSuite.java @@ -1,5 +1,7 @@ package com.snowflake.snowpark_test; +import static com.snowflake.snowpark_java.types.DataTypes.DateType; + import com.snowflake.snowpark_java.*; import java.sql.Date; import java.sql.Time; @@ -3186,4 +3188,81 @@ public void to_utc_timestamp() { Row[] expected = {Row.create(Timestamp.valueOf("2024-04-05 01:02:03.0"))}; checkAnswer(df.select(Functions.to_utc_timestamp(df.col("a"))), expected, false); } + + @Test + public void json_tuple1() { + DataFrame df = + getSession() + .sql( + "select parse_json(column1) as v, column2 as k from values ('{\"a\": null}','a'), " + + "('{\"a\": \"foo\"}','a'), ('{\"a\": \"foo\"}','b'), (null,'a')"); + df.show(); + DataFrame jsontupleDF = + df.select( + (Functions.json_tuple(Functions.col("v"), Functions.col("k")).toArray(new Column[0]))); + Row[] expected = { + Row.create((Object) null), + Row.create("foo"), + Row.create((Object) null), + Row.create((Object) null) + }; + 
checkAnswer(jsontupleDF, expected, false); + } + + @Test + public void json_tuple2() { + DataFrame df = + getSession() + .sql( + "select parse_json(column1) as v,column2 as id,column3 as name" + + " from values ( '{\"id\": 5,\"name\": \"Jose\", \"age\": 29}','id','name','age')"); + + DataFrame jsontupleDF = + df.select( + (Functions.json_tuple(Functions.col("v"), Functions.col("id"), Functions.col("name")) + .toArray(new Column[0]))); + jsontupleDF.show(); + Row[] expected = {Row.create(("5"), ("Jose"))}; + checkAnswer(jsontupleDF, expected, false); + } + + @Test + public void try_cast() { + DataFrame df = getSession().sql("select * from values('2024-04-05') as t(a)"); + Row[] expected = {Row.create(Date.valueOf("2024-04-05"))}; + checkAnswer(df.select(Functions.try_cast(df.col("a"), DateType)), expected, false); + } + + @Test + public void date_sub() { + DataFrame df = + getSession() + .sql( + "select * from values('2020-05-01 13:11:20.000' :: timestamp)," + + "('2020-08-21 01:30:05.000' :: timestamp) as T(a)"); + Row[] expected = { + Row.create(Date.valueOf("2020-04-30")), Row.create(Date.valueOf("2020-08-20")) + }; + checkAnswer(df.select(Functions.date_sub(df.col("a"), 1)), expected, false); + } + + @Test + public void from_json() { + DataFrame df = + getSession() + .sql( + "select parse_json(column1) as v,column2 as id,column3 as name" + + " from values ( '{\"id\": 5,\"name\": \"Jose\", \"age\": 29}','id','name','age')"); + + DataFrame jsonDF = df.select((Functions.from_json(Functions.col("v")))); + jsonDF.show(); + } + + @Test + public void cbrt() { + DataFrame df = getSession().sql("select column1 from values ( '5'),('1')"); + + Row[] expected = {Row.create((1.7099759466766968)), Row.create((1.0))}; + checkAnswer(df.select(Functions.cbrt(df.col("column1"))), expected, false); + } } diff --git a/src/test/scala/com/snowflake/snowpark_test/FunctionSuite.scala b/src/test/scala/com/snowflake/snowpark_test/FunctionSuite.scala index 8af28666..c350ca9d 100644 --- 
a/src/test/scala/com/snowflake/snowpark_test/FunctionSuite.scala +++ b/src/test/scala/com/snowflake/snowpark_test/FunctionSuite.scala @@ -2519,6 +2519,53 @@ trait FunctionSuite extends TestData { checkAnswer(data.select(to_utc_timestamp(col("a"))), expected, sort = false) } + test("cbrt") { + checkAnswer( + testData1.select(cbrt(col("NUM"))), + Seq(Row(1.0), Row(1.25992104989)), + sort = false) + } + test("from_json") { + val data_for_json = Seq( + (1, "{\"id\": 172319, \"age\": 41, \"relative\": {\"id\": 885471, \"age\": 29}}"), + (2, "{\"id\": 532161, \"age\": 17, \"relative\":{\"id\": 873513, \"age\": 47}}")) + val data_for_json_column = Seq("col1", "col2") + val df_for_json = session.createDataFrame(data_for_json).toDF(data_for_json_column) + val json_df = df_for_json.select(from_json(col("col2")).as("json")).show() + + } + + test("json_tuple") { + val json_tuple = builtin("JSON_EXTRACT_PATH_TEXT") + checkAnswer( + validJson1.select(json_tuple(col("v"), col("K"))), + Seq(Row(null), Row("foo"), Row(null), Row(null)), + sort = false) + } + + test("json_tuple1") { + val df = session.sql( + "select parse_json(column1) as v,column2 as id,column3 as name" + + " from values ( '{\"id\": 5,\"name\": \"Jose\", \"age\": 29}','id','name','age')") + df.show() + + checkAnswer( + df.select(json_tuple(col("v"), col("id"), col("name"))), + Seq(Row(("5"), ("Jose"))), + sort = false) + } + test("try_cast") { + val df = Seq("1", "2").toDF("a") + checkAnswer(df.select(try_cast(col("a"), IntegerType)), Seq(1, 2), sort = false) + } + test("date_sub") { + var expected = Seq(Date.valueOf("2020-04-30"), Date.valueOf("2020-08-20")).toDF("b") + checkAnswer( + timestamp1 + .select(date_sub(col("a"), 1)), + expected, + sort = false) + } } class EagerFunctionSuite extends FunctionSuite with EagerSession