From e4696552a7213e0a7dca3898769681c19515fac8 Mon Sep 17 00:00:00 2001 From: Shyamala Jayabalan Date: Mon, 5 Aug 2024 16:15:29 -0400 Subject: [PATCH 01/11] Added regexp_extract,signum,substring_index,collect_list 1) Added regexp_extract,signum,substring_index,collect_list to functions.scala . 2) Added test cases for the same --- .../com/snowflake/snowpark/functions.scala | 106 ++++++++++++++++++ .../snowpark_test/FunctionSuite.scala | 38 +++++++ 2 files changed, 144 insertions(+) diff --git a/src/main/scala/com/snowflake/snowpark/functions.scala b/src/main/scala/com/snowflake/snowpark/functions.scala index a7fd9ff0..cc3ca323 100644 --- a/src/main/scala/com/snowflake/snowpark/functions.scala +++ b/src/main/scala/com/snowflake/snowpark/functions.scala @@ -3140,6 +3140,112 @@ object functions { */ def listagg(col: Column): Column = listagg(col, "", isDistinct = false) + /** + * This function receives a column and extracts the groupIdx from the string + * after applying the exp regex. Returns empty string when the string doesn't + * match and null if the input is null. + * + * This function applies the `case sensitive` and `extract` flags. + * It doesn't apply multiline nor .* matches newlines. + * If these flags need to be applied, use `builtin("REGEXP_SUBSTR")` + * instead and apply the desired flags. + * + * Note: non-greedy tokens such as `.*?` are not supported + * @since 1.12.1 + * @param colName Column to apply regex. + * @param exp Regex expression to apply. + * @param grpIdx Group to extract. + * @return Column object. + */ + def regexp_extract( + colName: Column, + exp: String, + position: Int, + Occurences: Int, + grpIdx: Int): Column = { + when(colName.is_null, lit(null)) + .otherwise( + coalesce( + builtin("REGEXP_SUBSTR")( + colName, + lit(exp), + lit(position), + lit(Occurences), + lit("ce"), + lit(grpIdx)), + lit(""))) + } + + /** + * Returns the sign of the given column. 
Returns either 1 for positive, + * 0 for 0 or + * NaN, -1 for negative and null for null. + * NOTE: if string values are provided snowflake will attempts to cast. + * If it casts correctly, returns the calculation, + * if not an error will be thrown + * @since 1.12.1 + * @param e Column to calculate the sign. + * @return Column object. + */ + def signum(colName: Column): Column = { + builtin("SIGN")(colName) + } + + /** + * Returns the sign of the given column. Returns either 1 for positive, + * 0 for 0 or + * NaN, -1 for negative and null for null. + * NOTE: if string values are provided snowflake will attempts to cast. + * If it casts correctly, returns the calculation, + * if not an error will be thrown + * @since 1.12.1 + * @param columnName Name of the column to calculate the sign. + * @return Column object. + */ + def signum(columnName: String): Column = { + signum(col(columnName)) + } + + /** + * Returns the substring from string str before count occurrences + * of the delimiter delim. If count is positive, + * everything the left of the final delimiter (counting from left) + * is returned. If count is negative, every to the right of the + * final delimiter (counting from the right) is returned. + * substring_index performs a case-sensitive match when searching for delim. + * @since 1.12.1 + */ + def substring_index(str: Column, delim: String, count: Int): Column = { + when( + lit(count) < lit(0), + callBuiltin( + "substring", + lit(str), + callBuiltin("regexp_instr", sqlExpr(s"reverse(${str}, ${delim}, 1, abs(${count}), 0")))) + .otherwise( + callBuiltin( + "substring", + lit(str), + 1, + callBuiltin("regexp_instr", col("str"), lit(delim), 1, lit(count), 1))) + } + + /** + * Wrapper for Snowflake built-in collect_list function. Get the values of array column. + * @since 1.10.0 + * @param c Column to be collect. + * @return The array. + */ + def collect_list(c: Column): Column = array_agg(c) + + /** + * Wrapper for Snowflake built-in collect_list function. 
Get the values of array column. + * @since 1.10.0 + * @param s Column name to be collected. + * @return The array. + */ + def collect_list(s: String): Column = array_agg(col(s)) + /** * Invokes a built-in snowflake function with the specified name and arguments. * Arguments can be of two types diff --git a/src/test/scala/com/snowflake/snowpark_test/FunctionSuite.scala b/src/test/scala/com/snowflake/snowpark_test/FunctionSuite.scala index e473de12..21304cf7 100644 --- a/src/test/scala/com/snowflake/snowpark_test/FunctionSuite.scala +++ b/src/test/scala/com/snowflake/snowpark_test/FunctionSuite.scala @@ -2177,7 +2177,45 @@ trait FunctionSuite extends TestData { expected, sort = false) } + test("regexp_extract") { + val data = Seq("A MAN A PLAN A CANAL").toDF("a") + var expected = Seq(Row("MAN")) + checkAnswer( + data.select(regexp_extract(col("a"), "A\\W+(\\w+)", 1, 1, 1)), + expected, + sort = false) + expected = Seq(Row("PLAN")) + checkAnswer( + data.select(regexp_extract(col("a"), "A\\W+(\\w+)", 1, 2, 1)), + expected, + sort = false) + expected = Seq(Row("CANAL")) + checkAnswer( + data.select(regexp_extract(col("a"), "A\\W+(\\w+)", 1, 3, 1)), + expected, + sort = false) + + expected = Seq(Row(null)) + checkAnswer( + data.select(regexp_extract(col("a"), "A\\W+(\\w+)", 1, 4, 1)), + expected, + sort = false) + } + test("signum") { + val df = Seq(1, -2, 0).toDF("a") + checkAnswer(df.select(signum(col("a"))), Seq(Row(1), Row(-1), Row(0)), sort = false) + } + test("collect_list") { + assert(monthlySales.select(collect_list(col("amount"))).collect()(0).get(0).toString == + "[\n 10000,\n 400,\n 4500,\n 35000,\n 5000,\n 3000,\n 200,\n 90500,\n 6000,\n " + + "5000,\n 2500,\n 9500,\n 8000,\n 10000,\n 800,\n 4500\n]") + + } + test("substring_index") { + val df = Seq("It was the best of times, it was the worst of times").toDF("a") + checkAnswer(df.select(substring_index(col("a"), "was", 1)), Seq(Row(7)), sort = false) + } } class EagerFunctionSuite extends FunctionSuite with 
EagerSession From 72c63b4175415b70a9d50c709e99276a645b2f24 Mon Sep 17 00:00:00 2001 From: Shyamala Jayabalan Date: Tue, 6 Aug 2024 10:15:02 -0400 Subject: [PATCH 02/11] Added examples and updated the description --- .../com/snowflake/snowpark/functions.scala | 66 ++++++++++++++----- 1 file changed, 48 insertions(+), 18 deletions(-) diff --git a/src/main/scala/com/snowflake/snowpark/functions.scala b/src/main/scala/com/snowflake/snowpark/functions.scala index cc3ca323..94f9f9d5 100644 --- a/src/main/scala/com/snowflake/snowpark/functions.scala +++ b/src/main/scala/com/snowflake/snowpark/functions.scala @@ -3141,20 +3141,24 @@ object functions { def listagg(col: Column): Column = listagg(col, "", isDistinct = false) /** - * This function receives a column and extracts the groupIdx from the string - * after applying the exp regex. Returns empty string when the string doesn't - * match and null if the input is null. - * - * This function applies the `case sensitive` and `extract` flags. - * It doesn't apply multiline nor .* matches newlines. - * If these flags need to be applied, use `builtin("REGEXP_SUBSTR")` - * instead and apply the desired flags. - * + * Signature - snowflake.snowpark.functions.regexp_extract + * (value: Union[Column, str], regexp: Union[Column, str], idx: int) + * → Column + * Extract a specific group matched by a regex, from the specified string + * column. If the regex did not match, or the specified group did not match, + * an empty string is returned. + * Example: + * from snowflake.snowpark.functions import regexp_extract + * df = session.createDataFrame([["id_20_30", 10], ["id_40_50", 30]], ["id", "age"]) + * df.select(regexp_extract("id", r"(\d+)", 1).alias("RES")).show() + * --------- + * |"RES" | + * --------- + * |20 | + * |40 | + * --------- * Note: non-greedy tokens such as `.*?` are not supported * @since 1.12.1 - * @param colName Column to apply regex. - * @param exp Regex expression to apply. - * @param grpIdx Group to extract. 
* @return Column object. */ def regexp_extract( @@ -3177,12 +3181,23 @@ object functions { } /** - * Returns the sign of the given column. Returns either 1 for positive, - * 0 for 0 or - * NaN, -1 for negative and null for null. - * NOTE: if string values are provided snowflake will attempts to cast. - * If it casts correctly, returns the calculation, - * if not an error will be thrown + * Returns the sign of its argument: + * + * - -1 if the argument is negative. + * - 1 if it is positive. + * - 0 if it is 0. + * + * Args: + * col: The column to evaluate its sign + * + * Example:: + * >>> df = session.create_dataframe([(-2, 2, 0)], ["a", "b", "c"]) + * >>> df.select(sign("a").alias("a_sign"), sign("b").alias("b_sign"), sign("c").alias("c_sign")).show() + * ---------------------------------- + * |"A_SIGN" |"B_SIGN" |"C_SIGN" | + * ---------------------------------- + * |-1 |1 |0 | + * ---------------------------------- * @since 1.12.1 * @param e Column to calculate the sign. * @return Column object. @@ -3232,6 +3247,21 @@ object functions { /** * Wrapper for Snowflake built-in collect_list function. Get the values of array column. + * Returns the input values, pivoted into an ARRAY. If the input is empty, an empty + * ARRAY is returned. + * + * Example:: + * >>> df = session.create_dataframe([[1], [2], [3], [1]], schema=["a"]) + * >>> df.select(array_agg("a", True).alias("result")).show() + * ------------ + * |"RESULT" | + * ------------ + * |[ | + * | 1, | + * | 2, | + * | 3 | + * |] | + * ------------ * @since 1.10.0 * @param c Column to be collect. * @return The array. 
From 1e50ed571869ab3201794b42646bd6b13ef5773a Mon Sep 17 00:00:00 2001 From: Shyamala Jayabalan Date: Wed, 7 Aug 2024 14:22:29 -0400 Subject: [PATCH 03/11] Fixed format --- src/main/scala/com/snowflake/snowpark/functions.scala | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/main/scala/com/snowflake/snowpark/functions.scala b/src/main/scala/com/snowflake/snowpark/functions.scala index 94f9f9d5..83865414 100644 --- a/src/main/scala/com/snowflake/snowpark/functions.scala +++ b/src/main/scala/com/snowflake/snowpark/functions.scala @@ -3144,8 +3144,8 @@ object functions { * Signature - snowflake.snowpark.functions.regexp_extract * (value: Union[Column, str], regexp: Union[Column, str], idx: int) * → Column - * Extract a specific group matched by a regex, from the specified string - * column. If the regex did not match, or the specified group did not match, + * Extract a specific group matched by a regex, from the specified string + * column. If the regex did not match, or the specified group did not match, * an empty string is returned. 
* Example: * from snowflake.snowpark.functions import regexp_extract @@ -3192,7 +3192,8 @@ object functions { * * Example:: * >>> df = session.create_dataframe([(-2, 2, 0)], ["a", "b", "c"]) - * >>> df.select(sign("a").alias("a_sign"), sign("b").alias("b_sign"), sign("c").alias("c_sign")).show() + * >>> df.select(sign("a").alias("a_sign"), sign("b").alias("b_sign"), + * sign("c").alias("c_sign")).show() * ---------------------------------- * |"A_SIGN" |"B_SIGN" |"C_SIGN" | * ---------------------------------- From 17e50690f05c78fabee61352f11d63650e926779 Mon Sep 17 00:00:00 2001 From: Shyamala Jayabalan Date: Wed, 7 Aug 2024 14:29:39 -0400 Subject: [PATCH 04/11] formatted the comments --- .../scala/com/snowflake/snowpark/functions.scala | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/main/scala/com/snowflake/snowpark/functions.scala b/src/main/scala/com/snowflake/snowpark/functions.scala index 83865414..4adf4165 100644 --- a/src/main/scala/com/snowflake/snowpark/functions.scala +++ b/src/main/scala/com/snowflake/snowpark/functions.scala @@ -3143,21 +3143,22 @@ object functions { /** * Signature - snowflake.snowpark.functions.regexp_extract * (value: Union[Column, str], regexp: Union[Column, str], idx: int) - * → Column + * Column * Extract a specific group matched by a regex, from the specified string * column. If the regex did not match, or the specified group did not match, * an empty string is returned. 
* Example: * from snowflake.snowpark.functions import regexp_extract - * df = session.createDataFrame([["id_20_30", 10], ["id_40_50", 30]], ["id", "age"]) + * df = session.createDataFrame([["id_20_30", 10], ["id_40_50", 30]], + * ["id", "age"]) * df.select(regexp_extract("id", r"(\d+)", 1).alias("RES")).show() - * --------- - * |"RES" | - * --------- + * + * "RES" | + * * |20 | * |40 | - * --------- - * Note: non-greedy tokens such as `.*?` are not supported + * + * Note: non-greedy tokens such as are not supported * @since 1.12.1 * @return Column object. */ From dc7dd4a7a3afa067e733cb39a52ddd90b383c26e Mon Sep 17 00:00:00 2001 From: Shyamala Jayabalan Date: Wed, 7 Aug 2024 16:24:13 -0400 Subject: [PATCH 05/11] Added java functions and unit test cases for java --- .../snowflake/snowpark_java/Functions.java | 72 +++++++++++++++++++ .../com/snowflake/snowpark/functions.scala | 21 +++++- .../snowpark_test/JavaFunctionSuite.java | 44 ++++++++++++ 3 files changed, 134 insertions(+), 3 deletions(-) diff --git a/src/main/java/com/snowflake/snowpark_java/Functions.java b/src/main/java/com/snowflake/snowpark_java/Functions.java index 56d8d08b..ef3c9718 100644 --- a/src/main/java/com/snowflake/snowpark_java/Functions.java +++ b/src/main/java/com/snowflake/snowpark_java/Functions.java @@ -3880,6 +3880,78 @@ public static Column listagg(Column col) { return new Column(com.snowflake.snowpark.functions.listagg(col.toScalaColumn())); } + /** + * Signature - snowflake.snowpark.functions.regexp_extract (value: Union[Column, str], regexp: + * Union[Column, str], idx: int) Column Extract a specific group matched by a regex, from the + * specified string column. If the regex did not match, or the specified group did not match, an + * empty string is returned. Example: from snowflake.snowpark.functions import regexp_extract df = + * session.createDataFrame([["id_20_30", 10], ["id_40_50", 30]], ["id", "age"]) + * df.select(regexp_extract("id", r"(\d+)", 1).alias("RES")).show() + * + *

"RES" | + * + *

|20 | |40 | + * + *

Note: non-greedy tokens such as are not supported + * + * @since 1.12.1 + * @return Column object. + */ + public static Column regexp_extract( + Column col, String exp, Integer position, Integer Occurences, Integer grpIdx) { + return new Column( + com.snowflake.snowpark.functions.regexp_extract( + col.toScalaColumn(), exp, position, Occurences, grpIdx)); + } + + /** + * Returns the sign of its argument: + * + *

- -1 if the argument is negative. - 1 if it is positive. - 0 if it is 0. + * + *

Args: col: The column to evaluate its sign + * + *

Example:: >>> df = session.create_dataframe([(-2, 2, 0)], ["a", "b", "c"]) >>> + * df.select(sign("a").alias("a_sign"), sign("b").alias("b_sign"), + * sign("c").alias("c_sign")).show() ---------------------------------- |"A_SIGN" |"B_SIGN" + * |"C_SIGN" | ---------------------------------- |-1 |1 |0 | ---------------------------------- + * + * @since 1.12.1 + * @param e Column to calculate the sign. + * @return Column object. + */ + public static Column signum(Column col) { + return new Column(com.snowflake.snowpark.functions.signum(col.toScalaColumn())); + } + + /** + * Returns the substring from string str before count occurrences of the delimiter delim. If count + * is positive, everything the left of the final delimiter (counting from left) is returned. If + * count is negative, every to the right of the final delimiter (counting from the right) is + * returned. substring_index performs a case-sensitive match when searching for delim. + * + * @since 1.12.1 + */ + public static Column substring_index(Column col, String delim, Integer count) { + return new Column( + com.snowflake.snowpark.functions.substring_index(col.toScalaColumn(), delim, count)); + } + + /** + * Wrapper for Snowflake built-in collect_list function. Get the values of array column. Returns + * the input values, pivoted into an ARRAY. If the input is empty, an empty ARRAY is returned. + * + *

Example:: >>> df = session.create_dataframe([[1], [2], [3], [1]], schema=["a"]) >>> + * df.select(array_agg("a", True).alias("result")).show() ------------ |"RESULT" | ------------ |[ + * | | 1, | | 2, | | 3 | |] | ------------ + * + * @since 1.10.0 + * @param c Column to be collect. + * @return The array. + */ + public static Column collect_list(Column col) { + return new Column(com.snowflake.snowpark.functions.collect_list(col.toScalaColumn())); + } /** * Calls a user-defined function (UDF) by name. * diff --git a/src/main/scala/com/snowflake/snowpark/functions.scala b/src/main/scala/com/snowflake/snowpark/functions.scala index 4adf4165..65ed38cf 100644 --- a/src/main/scala/com/snowflake/snowpark/functions.scala +++ b/src/main/scala/com/snowflake/snowpark/functions.scala @@ -3171,7 +3171,7 @@ object functions { when(colName.is_null, lit(null)) .otherwise( coalesce( - builtin("REGEXP_SUBSTR")( + builtin("REGEX_SUBSTR")( colName, lit(exp), lit(position), @@ -3248,7 +3248,7 @@ object functions { } /** - * Wrapper for Snowflake built-in collect_list function. Get the values of array column. + * * Returns the input values, pivoted into an ARRAY. If the input is empty, an empty * ARRAY is returned. * @@ -3271,7 +3271,22 @@ object functions { def collect_list(c: Column): Column = array_agg(c) /** - * Wrapper for Snowflake built-in collect_list function. Get the values of array column. + * + * Returns the input values, pivoted into an ARRAY. If the input is empty, an empty + * ARRAY is returned. + * + * Example:: + * >>> df = session.create_dataframe([[1], [2], [3], [1]], schema=["a"]) + * >>> df.select(array_agg("a", True).alias("result")).show() + * ------------ + * |"RESULT" | + * ------------ + * |[ | + * | 1, | + * | 2, | + * | 3 | + * |] | + * ------------ * @since 1.10.0 * @param s Column name to be collected. * @return The array. 
diff --git a/src/test/java/com/snowflake/snowpark_test/JavaFunctionSuite.java b/src/test/java/com/snowflake/snowpark_test/JavaFunctionSuite.java index 6ee298d3..0fafaf0c 100644 --- a/src/test/java/com/snowflake/snowpark_test/JavaFunctionSuite.java +++ b/src/test/java/com/snowflake/snowpark_test/JavaFunctionSuite.java @@ -2764,4 +2764,48 @@ public void any_value() { assert result.length == 1; assert result[0].getInt(0) == 1 || result[0].getInt(0) == 2 || result[0].getInt(0) == 3; } + + @Test + public void regexp_extract() { + DataFrame df = getSession().sql("select * from values('A MAN A PLAN A CANAL') as T(a)"); + Row[] expected = {Row.create("MAN")}; + checkAnswer( + df.select(Functions.regexp_extract(df.col("a"), "A\\W+(\\w+)", 1, 1, 1)), expected, false); + Row[] expected2 = {Row.create("PLAN")}; + checkAnswer( + df.select(Functions.regexp_extract(df.col("a"), "A\\W+(\\w+)", 1, 2, 1)), expected2, false); + Row[] expected3 = {Row.create("CANAL")}; + checkAnswer( + df.select(Functions.regexp_extract(df.col("a"), "A\\W+(\\w+)", 1, 2, 1)), expected3, false); + Row[] expected4 = {Row.create(null)}; + checkAnswer( + df.select(Functions.regexp_extract(df.col("a"), "A\\W+(\\w+)", 1, 3, 1)), expected4, false); + } + + @Test + public void signum() { + DataFrame df = getSession().sql("select * from values(1,-2,0) as T(a)"); + checkAnswer(df.select(Functions.signum(df.col("a"))), new Row[] {Row.create(1, -1, 0)}, false); + } + + @Test + public void collect_list() { + DataFrame df = getSession().sql("select * from values(10000,400,450) as T(a)"); + checkAnswer( + df.select(Functions.collect_list(df.col("a"))), + new Row[] {Row.create("[\n \"10000,400,450\"\n]")}, + false); + } + + @Test + public void substring_index() { + DataFrame df = + getSession() + .sql( + "select * from values ('It was the best of times,it was the worst of times') as T(a)"); + checkAnswer( + df.select(Functions.substring_index(df.col("a"), "was", 1)), + new Row[] {Row.create(7)}, + false); + } } From 
34da52d4ce0c22f3a54cb68a44ddde59b6d3ac23 Mon Sep 17 00:00:00 2001 From: Shyamala Jayabalan Date: Wed, 7 Aug 2024 16:29:21 -0400 Subject: [PATCH 06/11] Added sign function --- .../snowflake/snowpark_java/Functions.java | 20 ++++++++++++++ .../com/snowflake/snowpark/functions.scala | 27 +++++++++++++++++++ .../snowpark_test/JavaFunctionSuite.java | 6 +++++ .../snowpark_test/FunctionSuite.scala | 4 +++ 4 files changed, 57 insertions(+) diff --git a/src/main/java/com/snowflake/snowpark_java/Functions.java b/src/main/java/com/snowflake/snowpark_java/Functions.java index ef3c9718..f7d199d5 100644 --- a/src/main/java/com/snowflake/snowpark_java/Functions.java +++ b/src/main/java/com/snowflake/snowpark_java/Functions.java @@ -3924,6 +3924,26 @@ public static Column signum(Column col) { return new Column(com.snowflake.snowpark.functions.signum(col.toScalaColumn())); } + /** + * Returns the sign of its argument: + * + *

- -1 if the argument is negative. - 1 if it is positive. - 0 if it is 0. + * + *

Args: col: The column to evaluate its sign + * + *

Example:: >>> df = session.create_dataframe([(-2, 2, 0)], ["a", "b", "c"]) >>> + * df.select(sign("a").alias("a_sign"), sign("b").alias("b_sign"), + * sign("c").alias("c_sign")).show() ---------------------------------- |"A_SIGN" |"B_SIGN" + * |"C_SIGN" | ---------------------------------- |-1 |1 |0 | ---------------------------------- + * + * @since 1.12.1 + * @param e Column to calculate the sign. + * @return Column object. + */ + public static Column sign(Column col) { + return new Column(com.snowflake.snowpark.functions.sign(col.toScalaColumn())); + } + /** * Returns the substring from string str before count occurrences of the delimiter delim. If count * is positive, everything the left of the final delimiter (counting from left) is returned. If diff --git a/src/main/scala/com/snowflake/snowpark/functions.scala b/src/main/scala/com/snowflake/snowpark/functions.scala index 65ed38cf..43c26cf1 100644 --- a/src/main/scala/com/snowflake/snowpark/functions.scala +++ b/src/main/scala/com/snowflake/snowpark/functions.scala @@ -3181,6 +3181,33 @@ object functions { lit(""))) } + /** + * Returns the sign of its argument: + * + * - -1 if the argument is negative. + * - 1 if it is positive. + * - 0 if it is 0. + * + * Args: + * col: The column to evaluate its sign + * + * Example:: + * >>> df = session.create_dataframe([(-2, 2, 0)], ["a", "b", "c"]) + * >>> df.select(sign("a").alias("a_sign"), sign("b").alias("b_sign"), + * sign("c").alias("c_sign")).show() + * ---------------------------------- + * |"A_SIGN" |"B_SIGN" |"C_SIGN" | + * ---------------------------------- + * |-1 |1 |0 | + * ---------------------------------- + * @since 1.12.1 + * @param e Column to calculate the sign. + * @return Column object. 
+ */ + def sign(colName: Column): Column = { + builtin("SIGN")(colName) + } + /** * Returns the sign of its argument: * diff --git a/src/test/java/com/snowflake/snowpark_test/JavaFunctionSuite.java b/src/test/java/com/snowflake/snowpark_test/JavaFunctionSuite.java index 0fafaf0c..e4e2a34d 100644 --- a/src/test/java/com/snowflake/snowpark_test/JavaFunctionSuite.java +++ b/src/test/java/com/snowflake/snowpark_test/JavaFunctionSuite.java @@ -2788,6 +2788,12 @@ public void signum() { checkAnswer(df.select(Functions.signum(df.col("a"))), new Row[] {Row.create(1, -1, 0)}, false); } + @Test + public void sign() { + DataFrame df = getSession().sql("select * from values(1,-2,0) as T(a)"); + checkAnswer(df.select(Functions.sign(df.col("a"))), new Row[] {Row.create(1, -1, 0)}, false); + } + @Test public void collect_list() { DataFrame df = getSession().sql("select * from values(10000,400,450) as T(a)"); diff --git a/src/test/scala/com/snowflake/snowpark_test/FunctionSuite.scala b/src/test/scala/com/snowflake/snowpark_test/FunctionSuite.scala index 21304cf7..ea28cc5b 100644 --- a/src/test/scala/com/snowflake/snowpark_test/FunctionSuite.scala +++ b/src/test/scala/com/snowflake/snowpark_test/FunctionSuite.scala @@ -2205,6 +2205,10 @@ trait FunctionSuite extends TestData { val df = Seq(1, -2, 0).toDF("a") checkAnswer(df.select(signum(col("a"))), Seq(Row(1), Row(-1), Row(0)), sort = false) } + test("sign") { + val df = Seq(1, -2, 0).toDF("a") + checkAnswer(df.select(sign(col("a"))), Seq(Row(1), Row(-1), Row(0)), sort = false) + } test("collect_list") { assert(monthlySales.select(collect_list(col("amount"))).collect()(0).get(0).toString == From 330f36077ab815e856006a2bd4894580bb505fad Mon Sep 17 00:00:00 2001 From: Shyamala Jayabalan Date: Thu, 8 Aug 2024 12:53:03 -0400 Subject: [PATCH 07/11] Modified the alignment --- .../snowflake/snowpark_java/Functions.java | 66 ++++++++++++------- .../com/snowflake/snowpark/functions.scala | 26 +++++--- 2 files changed, 58 insertions(+), 34 
deletions(-) diff --git a/src/main/java/com/snowflake/snowpark_java/Functions.java b/src/main/java/com/snowflake/snowpark_java/Functions.java index f7d199d5..ae5363e5 100644 --- a/src/main/java/com/snowflake/snowpark_java/Functions.java +++ b/src/main/java/com/snowflake/snowpark_java/Functions.java @@ -3884,15 +3884,19 @@ public static Column listagg(Column col) { * Signature - snowflake.snowpark.functions.regexp_extract (value: Union[Column, str], regexp: * Union[Column, str], idx: int) Column Extract a specific group matched by a regex, from the * specified string column. If the regex did not match, or the specified group did not match, an - * empty string is returned. Example: from snowflake.snowpark.functions import regexp_extract df = - * session.createDataFrame([["id_20_30", 10], ["id_40_50", 30]], ["id", "age"]) + * empty string is returned. + * Example: + *

{@code 
+   * from snowflake.snowpark.functions import regexp_extract
+   * df = session.createDataFrame([["id_20_30", 10], ["id_40_50", 30]], ["id", "age"])
    * df.select(regexp_extract("id", r"(\d+)", 1).alias("RES")).show()
-   *
-   * 

"RES" | - * - *

|20 | |40 | - * - *

Note: non-greedy tokens such as are not supported + * --------- + * |"RES" | + * --------- + * |20 | + * |40 | + * --------- + * } * * @since 1.12.1 * @return Column object. @@ -3909,12 +3913,18 @@ public static Column regexp_extract( * *

- -1 if the argument is negative. - 1 if it is positive. - 0 if it is 0. * - *

Args: col: The column to evaluate its sign - * - *

Example:: >>> df = session.create_dataframe([(-2, 2, 0)], ["a", "b", "c"]) >>> + *

Args: col: The column to evaluate its sign + * Example:: + * *

{@code df =
+   * session.create_dataframe([(-2, 2, 0)], ["a", "b", "c"]) >>>
    * df.select(sign("a").alias("a_sign"), sign("b").alias("b_sign"),
-   * sign("c").alias("c_sign")).show() ---------------------------------- |"A_SIGN" |"B_SIGN"
-   * |"C_SIGN" | ---------------------------------- |-1 |1 |0 | ----------------------------------
+   * sign("c").alias("c_sign")).show() 
+   *   ----------------------------------
+   *     |"A_SIGN"  |"B_SIGN"  |"C_SIGN"  |
+   *     ----------------------------------
+   *     |-1        |1         |0         |
+   *     ----------------------------------
+   * }
    *
    * @since 1.12.1
    * @param e Column to calculate the sign.
@@ -3929,12 +3939,18 @@ public static Column signum(Column col) {
    *
    * 

- -1 if the argument is negative. - 1 if it is positive. - 0 if it is 0. * - *

Args: col: The column to evaluate its sign - * - *

Example:: >>> df = session.create_dataframe([(-2, 2, 0)], ["a", "b", "c"]) >>> + *

Args: col: The column to evaluate its sign + * Example:: + *

{@code df =
+   * session.create_dataframe([(-2, 2, 0)], ["a", "b", "c"]) >>>
    * df.select(sign("a").alias("a_sign"), sign("b").alias("b_sign"),
-   * sign("c").alias("c_sign")).show() ---------------------------------- |"A_SIGN" |"B_SIGN"
-   * |"C_SIGN" | ---------------------------------- |-1 |1 |0 | ----------------------------------
+   * sign("c").alias("c_sign")).show() 
+   *   ----------------------------------
+   *     |"A_SIGN"  |"B_SIGN"  |"C_SIGN"  |
+   *     ----------------------------------
+   *     |-1        |1         |0         |
+   *     ----------------------------------
+   * }
    *
    * @since 1.12.1
    * @param e Column to calculate the sign.
@@ -3958,12 +3974,14 @@ public static Column substring_index(Column col, String delim, Integer count) {
   }
 
   /**
-   * Wrapper for Snowflake built-in collect_list function. Get the values of array column. Returns
-   * the input values, pivoted into an ARRAY. If the input is empty, an empty ARRAY is returned.
-   *
-   * 

Example:: >>> df = session.create_dataframe([[1], [2], [3], [1]], schema=["a"]) >>> - * df.select(array_agg("a", True).alias("result")).show() ------------ |"RESULT" | ------------ |[ - * | | 1, | | 2, | | 3 | |] | ------------ + * Returns the input values, pivoted into an ARRAY. If the input is empty, an empty ARRAY is + * returned. + *

Example:: + *

{@code
+   * df = session.create_dataframe([[1], [2], [3], [1]], schema=["a"])
+   * df.select(array_agg("a", True).alias("result")).show() 
+   * "RESULT" [ 1, 2, 3 ] 
+   * }
* * @since 1.10.0 * @param c Column to be collect. diff --git a/src/main/scala/com/snowflake/snowpark/functions.scala b/src/main/scala/com/snowflake/snowpark/functions.scala index 43c26cf1..995931a2 100644 --- a/src/main/scala/com/snowflake/snowpark/functions.scala +++ b/src/main/scala/com/snowflake/snowpark/functions.scala @@ -3147,17 +3147,20 @@ object functions { * Extract a specific group matched by a regex, from the specified string * column. If the regex did not match, or the specified group did not match, * an empty string is returned. - * Example: + * Example: * from snowflake.snowpark.functions import regexp_extract * df = session.createDataFrame([["id_20_30", 10], ["id_40_50", 30]], * ["id", "age"]) * df.select(regexp_extract("id", r"(\d+)", 1).alias("RES")).show() - * - * "RES" | - * - * |20 | - * |40 | - * + * + * + * --------- + * |"RES" | + * --------- + * |20 | + * |40 | + * --------- + * * Note: non-greedy tokens such as are not supported * @since 1.12.1 * @return Column object. @@ -3190,7 +3193,7 @@ object functions { * * Args: * col: The column to evaluate its sign - * + * * Example:: * >>> df = session.create_dataframe([(-2, 2, 0)], ["a", "b", "c"]) * >>> df.select(sign("a").alias("a_sign"), sign("b").alias("b_sign"), @@ -3200,6 +3203,7 @@ object functions { * ---------------------------------- * |-1 |1 |0 | * ---------------------------------- + * * @since 1.12.1 * @param e Column to calculate the sign. * @return Column object. @@ -3217,7 +3221,7 @@ object functions { * * Args: * col: The column to evaluate its sign - * + * * Example:: * >>> df = session.create_dataframe([(-2, 2, 0)], ["a", "b", "c"]) * >>> df.select(sign("a").alias("a_sign"), sign("b").alias("b_sign"), @@ -3227,6 +3231,7 @@ object functions { * ---------------------------------- * |-1 |1 |0 | * ---------------------------------- + * * @since 1.12.1 * @param e Column to calculate the sign. * @return Column object. 
@@ -3278,7 +3283,7 @@ object functions { * * Returns the input values, pivoted into an ARRAY. If the input is empty, an empty * ARRAY is returned. - * + * * Example:: * >>> df = session.create_dataframe([[1], [2], [3], [1]], schema=["a"]) * >>> df.select(array_agg("a", True).alias("result")).show() @@ -3291,6 +3296,7 @@ object functions { * | 3 | * |] | * ------------ + * * @since 1.10.0 * @param c Column to be collect. * @return The array. From 2fe1b0ddff7a5737abd1951bf8b74c5d5cd8dcf5 Mon Sep 17 00:00:00 2001 From: Shyamala Jayabalan Date: Thu, 8 Aug 2024 13:11:56 -0400 Subject: [PATCH 08/11] Added examples --- .../java/com/snowflake/snowpark_java/Functions.java | 11 +++++++++++ src/main/scala/com/snowflake/snowpark/functions.scala | 11 +++++++++++ 2 files changed, 22 insertions(+) diff --git a/src/main/java/com/snowflake/snowpark_java/Functions.java b/src/main/java/com/snowflake/snowpark_java/Functions.java index ae5363e5..54f104b1 100644 --- a/src/main/java/com/snowflake/snowpark_java/Functions.java +++ b/src/main/java/com/snowflake/snowpark_java/Functions.java @@ -3965,6 +3965,17 @@ public static Column sign(Column col) { * is positive, everything the left of the final delimiter (counting from left) is returned. If * count is negative, every to the right of the final delimiter (counting from the right) is * returned. substring_index performs a case-sensitive match when searching for delim. 
+ * Example + * {@code + * select id, string1, + * regexp_instr( string1, 'A\\W+(\\w+)', 1, 1, 0, 'e', 1) as "POSITION1", + * regexp_instr( string1, 'A\\W+(\\w+)', 1, 2, 0, 'e', 1) as "POSITION2", + * regexp_instr( string1, 'A\\W+(\\w+)', 1, 3, 0, 'e', 1) as "POSITION3", + * regexp_instr( string1, 'A\\W+(\\w+)', 1, 4, 0, 'e', 1) as "POSITION4" + * from demo3; + * ID STRING1 POSITION1 POSITION2 POSITION3 POSITION4 + * 5 A MAN A PLAN A CANAL 3 9 16 0 + * } * * @since 1.12.1 */ diff --git a/src/main/scala/com/snowflake/snowpark/functions.scala b/src/main/scala/com/snowflake/snowpark/functions.scala index 995931a2..8e6a5455 100644 --- a/src/main/scala/com/snowflake/snowpark/functions.scala +++ b/src/main/scala/com/snowflake/snowpark/functions.scala @@ -3262,6 +3262,17 @@ object functions { * is returned. If count is negative, every to the right of the * final delimiter (counting from the right) is returned. * substring_index performs a case-sensitive match when searching for delim. + * Example + * + * select id, string1, + * regexp_instr( string1, 'A\\W+(\\w+)', 1, 1, 0, 'e', 1) as "POSITION1", + * regexp_instr( string1, 'A\\W+(\\w+)', 1, 2, 0, 'e', 1) as "POSITION2", + * regexp_instr( string1, 'A\\W+(\\w+)', 1, 3, 0, 'e', 1) as "POSITION3", + * regexp_instr( string1, 'A\\W+(\\w+)', 1, 4, 0, 'e', 1) as "POSITION4" + * from demo3; + * ID STRING1 POSITION1 POSITION2 POSITION3 POSITION4 + * 5 A MAN A PLAN A CANAL 3 9 16 0 + * * @since 1.12.1 */ def substring_index(str: Column, delim: String, count: Int): Column = { From 457d493a2c3e97846b7808fb00caba6efad5b2e0 Mon Sep 17 00:00:00 2001 From: Shyamala Jayabalan Date: Thu, 8 Aug 2024 13:25:28 -0400 Subject: [PATCH 09/11] adjusted comments --- .../java/com/snowflake/snowpark_java/Functions.java | 13 +------------ .../scala/com/snowflake/snowpark/functions.scala | 11 ----------- 2 files changed, 1 insertion(+), 23 deletions(-) diff --git a/src/main/java/com/snowflake/snowpark_java/Functions.java 
b/src/main/java/com/snowflake/snowpark_java/Functions.java index 54f104b1..ef75c513 100644 --- a/src/main/java/com/snowflake/snowpark_java/Functions.java +++ b/src/main/java/com/snowflake/snowpark_java/Functions.java @@ -3965,18 +3965,7 @@ public static Column sign(Column col) { * is positive, everything the left of the final delimiter (counting from left) is returned. If * count is negative, every to the right of the final delimiter (counting from the right) is * returned. substring_index performs a case-sensitive match when searching for delim. - * Example - * {@code - * select id, string1, - * regexp_instr( string1, 'A\\W+(\\w+)', 1, 1, 0, 'e', 1) as "POSITION1", - * regexp_instr( string1, 'A\\W+(\\w+)', 1, 2, 0, 'e', 1) as "POSITION2", - * regexp_instr( string1, 'A\\W+(\\w+)', 1, 3, 0, 'e', 1) as "POSITION3", - * regexp_instr( string1, 'A\\W+(\\w+)', 1, 4, 0, 'e', 1) as "POSITION4" - * from demo3; - * ID STRING1 POSITION1 POSITION2 POSITION3 POSITION4 - * 5 A MAN A PLAN A CANAL 3 9 16 0 - * } - * + * * @since 1.12.1 */ public static Column substring_index(Column col, String delim, Integer count) { diff --git a/src/main/scala/com/snowflake/snowpark/functions.scala b/src/main/scala/com/snowflake/snowpark/functions.scala index 8e6a5455..995931a2 100644 --- a/src/main/scala/com/snowflake/snowpark/functions.scala +++ b/src/main/scala/com/snowflake/snowpark/functions.scala @@ -3262,17 +3262,6 @@ object functions { * is returned. If count is negative, every to the right of the * final delimiter (counting from the right) is returned. * substring_index performs a case-sensitive match when searching for delim. 
- * Example - * - * select id, string1, - * regexp_instr( string1, 'A\\W+(\\w+)', 1, 1, 0, 'e', 1) as "POSITION1", - * regexp_instr( string1, 'A\\W+(\\w+)', 1, 2, 0, 'e', 1) as "POSITION2", - * regexp_instr( string1, 'A\\W+(\\w+)', 1, 3, 0, 'e', 1) as "POSITION3", - * regexp_instr( string1, 'A\\W+(\\w+)', 1, 4, 0, 'e', 1) as "POSITION4" - * from demo3; - * ID STRING1 POSITION1 POSITION2 POSITION3 POSITION4 - * 5 A MAN A PLAN A CANAL 3 9 16 0 - * * @since 1.12.1 */ def substring_index(str: Column, delim: String, count: Int): Column = { From 4ad7078dbfad7e7574bb2df6648f0eccb4c41ae7 Mon Sep 17 00:00:00 2001 From: Shyamala Jayabalan Date: Mon, 12 Aug 2024 12:59:59 -0400 Subject: [PATCH 10/11] Update Functions.java --- .../snowflake/snowpark_java/Functions.java | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/src/main/java/com/snowflake/snowpark_java/Functions.java b/src/main/java/com/snowflake/snowpark_java/Functions.java index ef75c513..a9102548 100644 --- a/src/main/java/com/snowflake/snowpark_java/Functions.java +++ b/src/main/java/com/snowflake/snowpark_java/Functions.java @@ -3884,9 +3884,9 @@ public static Column listagg(Column col) { * Signature - snowflake.snowpark.functions.regexp_extract (value: Union[Column, str], regexp: * Union[Column, str], idx: int) Column Extract a specific group matched by a regex, from the * specified string column. If the regex did not match, or the specified group did not match, an - * empty string is returned. - * Example: - *
{@code 
+   * empty string is returned.
+   * Example:
+   *  
{@code
    * from snowflake.snowpark.functions import regexp_extract
    * df = session.createDataFrame([["id_20_30", 10], ["id_40_50", 30]], ["id", "age"])
    * df.select(regexp_extract("id", r"(\d+)", 1).alias("RES")).show()
@@ -3913,12 +3913,12 @@ public static Column regexp_extract(
    *
    * 

- -1 if the argument is negative. - 1 if it is positive. - 0 if it is 0. * - *

Args: col: The column to evaluate its sign - * Example:: + *

Args: col: The column to evaluate its sign + * Example:: * *

{@code df =
    * session.create_dataframe([(-2, 2, 0)], ["a", "b", "c"]) >>>
    * df.select(sign("a").alias("a_sign"), sign("b").alias("b_sign"),
-   * sign("c").alias("c_sign")).show() 
+   * sign("c").alias("c_sign")).show()
    *   ----------------------------------
    *     |"A_SIGN"  |"B_SIGN"  |"C_SIGN"  |
    *     ----------------------------------
@@ -3939,12 +3939,12 @@ public static Column signum(Column col) {
    *
    * 

- -1 if the argument is negative. - 1 if it is positive. - 0 if it is 0. * - *

Args: col: The column to evaluate its sign - * Example:: + *

Args: col: The column to evaluate its sign + * Example:: *

{@code df =
    * session.create_dataframe([(-2, 2, 0)], ["a", "b", "c"]) >>>
    * df.select(sign("a").alias("a_sign"), sign("b").alias("b_sign"),
-   * sign("c").alias("c_sign")).show() 
+   * sign("c").alias("c_sign")).show()
    *   ----------------------------------
    *     |"A_SIGN"  |"B_SIGN"  |"C_SIGN"  |
    *     ----------------------------------
@@ -3965,7 +3965,7 @@ public static Column sign(Column col) {
    * is positive, everything the left of the final delimiter (counting from left) is returned. If
    * count is negative, every to the right of the final delimiter (counting from the right) is
    * returned. substring_index performs a case-sensitive match when searching for delim.
-   * 
+   *
    * @since 1.12.1
    */
   public static Column substring_index(Column col, String delim, Integer count) {
@@ -3975,12 +3975,14 @@ public static Column substring_index(Column col, String delim, Integer count) {
 
   /**
    * Returns the input values, pivoted into an ARRAY. If the input is empty, an empty ARRAY is
-   * returned. 
-   * 

Example:: - *

{@code
+   * returned.
+   *
+   * 

Example:: + * + *

{@code
    * df = session.create_dataframe([[1], [2], [3], [1]], schema=["a"])
-   * df.select(array_agg("a", True).alias("result")).show() 
-   * "RESULT" [ 1, 2, 3 ] 
+   * df.select(array_agg("a", True).alias("result")).show()
+   * "RESULT" [ 1, 2, 3 ]
    * }
* * @since 1.10.0 From 2669d2392540b93343ffb512996319d1e4ed60ea Mon Sep 17 00:00:00 2001 From: Shyamala Jayabalan Date: Tue, 13 Aug 2024 14:29:07 -0400 Subject: [PATCH 11/11] Fixed indentation issues --- src/main/java/com/snowflake/snowpark_java/Functions.java | 5 ++--- src/main/scala/com/snowflake/snowpark/functions.scala | 3 +-- .../java/com/snowflake/snowpark_test/JavaFunctionSuite.java | 5 +++-- .../scala/com/snowflake/snowpark_test/FunctionSuite.scala | 4 +--- 4 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/main/java/com/snowflake/snowpark_java/Functions.java b/src/main/java/com/snowflake/snowpark_java/Functions.java index 41c38135..4b3bdb1d 100644 --- a/src/main/java/com/snowflake/snowpark_java/Functions.java +++ b/src/main/java/com/snowflake/snowpark_java/Functions.java @@ -3882,7 +3882,7 @@ public static Column listagg(Column col) { } /** - + * * Signature - snowflake.snowpark.functions.regexp_extract (value: Union[Column, str], regexp: * Union[Column, str], idx: int) Column Extract a specific group matched by a regex, from the * specified string column. If the regex did not match, or the specified group did not match, an @@ -3995,7 +3995,7 @@ public static Column collect_list(Column col) { return new Column(com.snowflake.snowpark.functions.collect_list(col.toScalaColumn())); } - * Returns a Column expression with values sorted in descending order. + /** Returns a Column expression with values sorted in descending order. * *

Example: order column values in descending * @@ -4166,7 +4166,6 @@ public static Column last(Column col) { return new Column(functions.last(col.toScalaColumn())); } - /** * Calls a user-defined function (UDF) by name. * diff --git a/src/main/scala/com/snowflake/snowpark/functions.scala b/src/main/scala/com/snowflake/snowpark/functions.scala index a28be119..588d8290 100644 --- a/src/main/scala/com/snowflake/snowpark/functions.scala +++ b/src/main/scala/com/snowflake/snowpark/functions.scala @@ -3328,7 +3328,7 @@ object functions { */ def collect_list(s: String): Column = array_agg(col(s)) - * Returns a Column expression with values sorted in descending order. + /** Returns a Column expression with values sorted in descending order. * Example: * {{{ * val df = session.createDataFrame(Seq(1, 2, 3)).toDF("id") @@ -3498,7 +3498,6 @@ object functions { def last(c: Column): Column = builtin("LAST_VALUE")(c) - /** * Invokes a built-in snowflake function with the specified name and arguments. * Arguments can be of two types diff --git a/src/test/java/com/snowflake/snowpark_test/JavaFunctionSuite.java b/src/test/java/com/snowflake/snowpark_test/JavaFunctionSuite.java index 954cb278..33f2904a 100644 --- a/src/test/java/com/snowflake/snowpark_test/JavaFunctionSuite.java +++ b/src/test/java/com/snowflake/snowpark_test/JavaFunctionSuite.java @@ -2766,7 +2766,6 @@ public void any_value() { } @Test - public void regexp_extract() { DataFrame df = getSession().sql("select * from values('A MAN A PLAN A CANAL') as T(a)"); Row[] expected = {Row.create("MAN")}; @@ -2813,7 +2812,10 @@ public void substring_index() { checkAnswer( df.select(Functions.substring_index(df.col("a"), "was", 1)), new Row[] {Row.create(7)}, + false); + } + @Test public void test_asc() { DataFrame df = getSession().sql("select * from values(3),(1),(2) as t(a)"); Row[] expected = {Row.create(1), Row.create(2), Row.create(3)}; @@ -2874,7 +2876,6 @@ public void last() { Functions.last(df.col("name")) 
.over(Window.partitionBy(df.col("grade")).orderBy(df.col("score").desc()))), expected, - false); } } diff --git a/src/test/scala/com/snowflake/snowpark_test/FunctionSuite.scala b/src/test/scala/com/snowflake/snowpark_test/FunctionSuite.scala index 22f474ea..1420bb10 100644 --- a/src/test/scala/com/snowflake/snowpark_test/FunctionSuite.scala +++ b/src/test/scala/com/snowflake/snowpark_test/FunctionSuite.scala @@ -2195,7 +2195,6 @@ trait FunctionSuite extends TestData { expected, sort = false) - expected = Seq(Row(null)) checkAnswer( data.select(regexp_extract(col("a"), "A\\W+(\\w+)", 1, 4, 1)), @@ -2221,7 +2220,7 @@ trait FunctionSuite extends TestData { val df = Seq("It was the best of times, it was the worst of times").toDF("a") checkAnswer(df.select(substring_index(col("a"), "was", 1)), Seq(Row(7)), sort = false) } - + test("desc column order") { val input = Seq(1, 2, 3).toDF("data") val expected = Seq(3, 2, 1).toDF("data") @@ -2289,7 +2288,6 @@ trait FunctionSuite extends TestData { sort = false) } - } class EagerFunctionSuite extends FunctionSuite with EagerSession