From 6a9edb17bd9e4cf92e9f184ce473ecfff3530fb1 Mon Sep 17 00:00:00 2001 From: Shyamala Jayabalan Date: Thu, 12 Sep 2024 14:44:51 -0400 Subject: [PATCH] SNOW-802269-months_between_format_number (#159) * added new functions, test cases * added return parameter to the comment section * added return parameter to the comment section * Added column name as a string changes * modified the months_between --- .../snowflake/snowpark_java/Functions.java | 131 ++++++++++++++++++ .../com/snowflake/snowpark/functions.scala | 110 +++++++++++++++ .../snowpark_test/JavaFunctionSuite.java | 51 +++++++ .../snowpark_test/FunctionSuite.scala | 52 +++++++ 4 files changed, 344 insertions(+) diff --git a/src/main/java/com/snowflake/snowpark_java/Functions.java b/src/main/java/com/snowflake/snowpark_java/Functions.java index 99ed06f0..ce790653 100644 --- a/src/main/java/com/snowflake/snowpark_java/Functions.java +++ b/src/main/java/com/snowflake/snowpark_java/Functions.java @@ -4504,11 +4504,142 @@ public static Column from_unixtime(Column ut, String f) { * [Row(SEQ8(0)=0),Row(SEQ8(0)=1), Row(SEQ8(0)=2)] * } * + * @return A sequence of monotonically increasing integers, with wrap-around which happens after + * the largest representable integer of integer width 8 bytes. * @since 1.15.0 */ public static Column monotonically_increasing_id() { return new Column(com.snowflake.snowpark.functions.monotonically_increasing_id()); } + /** + * Returns number of months between dates `start` and `end`. + * + *

A whole number is returned if both inputs have the same day of month or both are the last + * day of their respective months. Otherwise, the difference is calculated assuming 31 days per + * month. + * + *

For example: + * + *

{@code
+   * {{{
+   * months_between("2017-11-14", "2017-07-14")  // returns 4.0
+   * months_between("2017-01-01", "2017-01-10")  // returns -0.29032258
+   * months_between("2017-06-01", "2017-06-16 12:00:00")  // returns -0.5
+   * }}}
+   * }
+ * + * @param end Name of the column holding the end value: a date, a timestamp, or a string in a + * format that can be cast to a timestamp, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` + * @param start Name of the column holding the start value: a date, a timestamp, or a string in a + * format that can be cast to a timestamp, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` + * @return A double, or null if either `end` or `start` were strings that could not be cast to a + * timestamp. Negative if `end` is before `start` + * @since 1.15.0 + */ + public static Column months_between(String end, String start) { + return new Column(com.snowflake.snowpark.functions.months_between(end, start)); + } + + /** + * Locate the position of the first occurrence of substr column in the given string. Returns null + * if either of the arguments are null. + * + *

Example + * + *

{@code
+   * SELECT id,
+   *        string1,
+   *        REGEXP_SUBSTR(string1, 'nevermore\\d') AS substring,
+   *        REGEXP_INSTR( string1, 'nevermore\\d') AS position
+   *   FROM demo1
+   *   ORDER BY id;
+   *
+   *   +----+-------------------------------------+------------+----------+
+   * | ID | STRING1                             | SUBSTRING  | POSITION |
+   * |----+-------------------------------------+------------+----------|
+   * |  1 | nevermore1, nevermore2, nevermore3. | nevermore1 |        1 |
+   * +----+-------------------------------------+------------+----------+
+   * }
+ * + * The position is not zero based, but 1 based index. Returns 0 if substr could not be found in + * str. + * + * @param str Column on which instr has to be applied + * @param substring Pattern to locate; the Scala API passes it to REGEXP_INSTR, so it is a regex + * @return The 1-based position of the first match, 0 if none, or null if an argument is null + * @since 1.15.0 + */ + public static Column instr(Column str, String substring) { + return new Column(com.snowflake.snowpark.functions.instr(str.toScalaColumn(), substring)); + } + + /** + * Given a timestamp like '2017-07-14 02:40:00.0', interprets it as a time in UTC, and renders + * that time as a timestamp in the given time zone. For example, 'GMT+1' would yield '2017-07-14 + * 03:40:00.0'. + * + *

For Example + * + *

{@code
+   * ALTER SESSION SET TIMEZONE = 'America/Los_Angeles';
+   * SELECT TO_TIMESTAMP_TZ('2024-04-05 01:02:03');
+   *  +----------------------------------------+
+   * | TO_TIMESTAMP_TZ('2024-04-05 01:02:03') |
+   * |----------------------------------------|
+   * | 2024-04-05 01:02:03.000 -0700          |
+   * +----------------------------------------+
+   * }
+ * + * @param ts A date, timestamp or string. If a string, the data must be in a format that can be + * cast to a timestamp, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS`. NOTE(review): this + * method has no time zone parameter. It delegates to the Scala API, which applies Snowflake + * TO_TIMESTAMP_TZ to `ts`, so the zone applied is presumably the session time zone rather + * than a caller-supplied zone ID (region-based such as 'America/Los_Angeles', or an offset + * such as '-08:00' or '+01:00'). Confirm against the intended (ts, tz) semantics before + * relying on the UTC conversion described above. + * @return A timestamp; presumably null if `ts` is a string that cannot be cast to a timestamp + * (TODO confirm) + * @since 1.15.0 + */ + public static Column from_utc_timestamp(Column ts) { + return new Column(com.snowflake.snowpark.functions.from_utc_timestamp(ts.toScalaColumn())); + } + + /** + * Given a timestamp like '2017-07-14 02:40:00.0', interprets it as a time in the given time zone, + * and renders that time as a timestamp in UTC. For example, 'GMT+1' would yield '2017-07-14 + * 01:40:00.0'. + * + * @param ts A date, timestamp or string. If a string, the data must be in a format that can be + * cast to a timestamp, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS`. NOTE(review): this + * method has no time zone parameter. It delegates to the Scala API, which applies Snowflake + * TO_TIMESTAMP_TZ to `ts`, so the zone applied is presumably the session time zone rather + * than a caller-supplied zone ID (region-based such as 'America/Los_Angeles', or an offset + * such as '-08:00' or '+01:00'). Confirm against the intended (ts, tz) semantics before + * relying on the UTC conversion described above. 
+ * @return A timestamp; presumably null if `ts` is a string that cannot be cast to a timestamp + * (TODO confirm) + * @since 1.15.0 + */ + public static Column to_utc_timestamp(Column ts) { + return new Column(com.snowflake.snowpark.functions.to_utc_timestamp(ts.toScalaColumn())); + } + + /** + * Formats numeric column x to a format like '#,###,###.##', rounded to d decimal places with + * HALF_EVEN round mode, and returns the result as a string column. + * + *

If d is 0, the result has no decimal point or fractional part. If d is less than 0, the + * result will be null. + * + * @param x numeric column to be transformed + * @param d Number of decimal places; must not be null because it is unboxed to a Scala Int + * @return Number cast to the specific string format + * @since 1.15.0 + */ + public static Column format_number(Column x, Integer d) { + return new Column(com.snowflake.snowpark.functions.format_number(x.toScalaColumn(), d)); + } /* Returns a Column expression with values sorted in descending order. * diff --git a/src/main/scala/com/snowflake/snowpark/functions.scala b/src/main/scala/com/snowflake/snowpark/functions.scala index 587e5e76..264b9ffe 100644 --- a/src/main/scala/com/snowflake/snowpark/functions.scala +++ b/src/main/scala/com/snowflake/snowpark/functions.scala @@ -3777,6 +3777,116 @@ object functions { */ def monotonically_increasing_id(): Column = builtin("seq8")() + /** + * Returns number of months between dates `start` and `end`. + * + * A whole number is returned if both inputs have the same day of month or both are the last day + * of their respective months. Otherwise, the difference is calculated assuming 31 days per month. + * + * For example: + * {{{ + * months_between("2017-11-14", "2017-07-14") // returns 4.0 + * months_between("2017-01-01", "2017-01-10") // returns -0.29032258 + * months_between("2017-06-01", "2017-06-16 12:00:00") // returns -0.5 + * }}} + * @since 1.15.0 + * @param end Column name. If a string, the data must be in a format that can + * be cast to a timestamp, such as yyyy-MM-dd + * or yyyy-MM-dd HH:mm:ss.SSSS + * @param start Column name. If a string, the data must be in a format that can be + * cast to a timestamp, such as yyyy-MM-dd or yyyy-MM-dd HH:mm:ss.SSSS + * @return A double, or null if either end or start were strings that could not be cast to a + * timestamp. 
Negative if end is before start + */ + def months_between(end: String, start: String): Column = + builtin("MONTHS_BETWEEN")(col(end), col(start)) + + /** + * Locate the position of the first occurrence of substr column in the given string. + * Returns null if either of the arguments are null. + * For example + * SELECT id, + * string1, + * REGEXP_SUBSTR(string1, 'nevermore\\d') AS substring, + * REGEXP_INSTR( string1, 'nevermore\\d') AS position + * FROM demo1 + * ORDER BY id; + * +----+-------------------------------------+------------+----------+ + * | ID | STRING1 | SUBSTRING | POSITION | + * |----+-------------------------------------+------------+----------| + * | 1 | nevermore1, nevermore2, nevermore3. | nevermore1 | 1 | + * +----+-------------------------------------+------------+----------+ + * + * @since 1.15.0 + * @note The position is 1-based, not 0-based; 0 is returned if substr is not found in str. + * substring is passed to REGEXP_INSTR, so it is matched as a regular expression. + */ + def instr(str: Column, substring: String): Column = builtin("REGEXP_INSTR")(str, substring) + + /** + * Given a timestamp like '2017-07-14 02:40:00.0', interprets it as a time in UTC, and renders + * that time as a timestamp in the given time zone. For example, 'GMT+1' would yield + * '2017-07-14 03:40:00.0'. + * ALTER SESSION SET TIMEZONE = 'America/Los_Angeles'; + * SELECT TO_TIMESTAMP_TZ('2024-04-05 01:02:03'); + * +----------------------------------------+ + * | TO_TIMESTAMP_TZ('2024-04-05 01:02:03') | + * |----------------------------------------| + * | 2024-04-05 01:02:03.000 -0700 | + * +----------------------------------------+ + * + * @since 1.15.0 + * @param ts A date, timestamp or string. If a string, the data must be in a format that can be + * cast to a timestamp, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS`. + * NOTE(review): this function has no time zone parameter. 
It applies Snowflake + * TO_TIMESTAMP_TZ to `ts`, so the zone applied is presumably the session time zone 
rather than + * a caller-supplied zone ID (region-based such as 'America/Los_Angeles', or an offset + * such as '-08:00' or '+01:00'). Confirm against the intended (ts, tz) semantics before + * relying on the UTC conversion described above; as written, this function and + * to_utc_timestamp build the same expression. + * @return A timestamp; presumably null if `ts` is a string that cannot be cast to a + * timestamp (TODO confirm) + */ + def from_utc_timestamp(ts: Column): Column = + builtin("TO_TIMESTAMP_TZ")(ts) + + /** + * Given a timestamp like '2017-07-14 02:40:00.0', interprets it as a time in the given time + * zone, and renders that time as a timestamp in UTC. For example, 'GMT+1' would yield + * '2017-07-14 01:40:00.0'. + * @since 1.15.0 + * @param ts A date, timestamp or string. If a string, the data must be in a format that can be + * cast to a timestamp, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS`. + * NOTE(review): this function has no time zone parameter. It applies Snowflake + * TO_TIMESTAMP_TZ to `ts`, so the zone applied is presumably the session time zone + * rather than a caller-supplied zone ID (region-based such as 'America/Los_Angeles', + * or an offset such as '-08:00' or '+01:00'). Confirm against the intended (ts, tz) + * semantics before relying on the UTC conversion described above; as written, this + * function and from_utc_timestamp build the same expression. + * @return A timestamp; presumably null if `ts` is a string that cannot be cast to a + * timestamp (TODO confirm) + */ + def to_utc_timestamp(ts: Column): Column = builtin("TO_TIMESTAMP_TZ")(ts) + + /** + * Formats numeric column x to a format like '#,###,###.##', rounded to d decimal places + * with HALF_EVEN round mode, and returns the result as a string column. + * If d is 0, the result has no decimal point or fractional part. + * If d is less than 0, the result will be null. + * @since 1.15.0 
+ * + * @param x numeric column to be transformed + * @param d Amount of decimal for the number format + * + * @return Number casted to the specific string format + */ + def format_number(x: Column, d: Int): Column = { + if (d < 0) { + lit(null) + } else { + builtin("TO_VARCHAR")(x, if (d > 0) s"999,999.${"0" * d}" else "999,999") + } + } /* Returns a Column expression with values sorted in descending order. * Example: * {{{ diff --git a/src/test/java/com/snowflake/snowpark_test/JavaFunctionSuite.java b/src/test/java/com/snowflake/snowpark_test/JavaFunctionSuite.java index f47fb35a..89e9cbb5 100644 --- a/src/test/java/com/snowflake/snowpark_test/JavaFunctionSuite.java +++ b/src/test/java/com/snowflake/snowpark_test/JavaFunctionSuite.java @@ -3135,4 +3135,55 @@ public void unhex() { Row[] expected = {Row.create("1"), Row.create("2"), Row.create("3")}; checkAnswer(df.select(Functions.unhex(Functions.col("a"))), expected, false); } + + @Test + public void months_between() { + DataFrame df = + getSession() + .sql( + "select * from values('2010-07-02'::Date,'2010-08-02'::Date), " + + "('2020-08-02'::Date,'2020-12-02'::Date) as t(a,b)"); + Row[] expected = {Row.create(1.000000), Row.create(4.000000)}; + checkAnswer(df.select(Functions.months_between("b", "a")), expected, false); + } + + @Test + public void instr() { + DataFrame df = + getSession() + .sql( + "select * from values('It was the best of times, it was the worst of times') as t(a)"); + Row[] expected = {Row.create(4)}; + checkAnswer(df.select(Functions.instr(df.col("a"), "was")), expected, false); + } + + @Test + public void format_number1() { + DataFrame df = getSession().sql("select * from values(1),(2),(3) as t(a)"); + Row[] expected = {Row.create("1"), Row.create("2"), Row.create("3")}; + checkAnswer( + df.select(Functions.ltrim(Functions.format_number(df.col("a"), 0))), expected, false); + } + + @Test + public void format_number2() { + DataFrame df = getSession().sql("select * from values(1),(2),(3) as 
t(a)"); + Row[] expected = {Row.create("1.00"), Row.create("2.00"), Row.create("3.00")}; + checkAnswer( + df.select(Functions.ltrim(Functions.format_number(df.col("a"), 2))), expected, false); + } + + @Test + public void from_utc_timestamp() { + DataFrame df = getSession().sql("select * from values('2024-04-05 01:02:03') as t(a)"); + Row[] expected = {Row.create(Timestamp.valueOf("2024-04-05 01:02:03.0"))}; + checkAnswer(df.select(Functions.from_utc_timestamp(df.col("a"))), expected, false); + } + + @Test + public void to_utc_timestamp() { + DataFrame df = getSession().sql("select * from values('2024-04-05 01:02:03') as t(a)"); + Row[] expected = {Row.create(Timestamp.valueOf("2024-04-05 01:02:03.0"))}; + checkAnswer(df.select(Functions.to_utc_timestamp(df.col("a"))), expected, false); + } } diff --git a/src/test/scala/com/snowflake/snowpark_test/FunctionSuite.scala b/src/test/scala/com/snowflake/snowpark_test/FunctionSuite.scala index 7f6f8038..8af28666 100644 --- a/src/test/scala/com/snowflake/snowpark_test/FunctionSuite.scala +++ b/src/test/scala/com/snowflake/snowpark_test/FunctionSuite.scala @@ -2466,6 +2466,58 @@ trait FunctionSuite extends TestData { Seq(Row("1"), Row("2"), Row("3")), sort = false) } + test("months_between") { + // Exercise the months_between(end, start) wrapper with column names, not the raw builtin. + val input = Seq( + (Date.valueOf("2010-08-02"), Date.valueOf("2010-07-02")), + (Date.valueOf("2020-12-02"), Date.valueOf("2020-08-02"))) + .toDF("a", "b") + checkAnswer( + input.select(months_between("a", "b")), + Seq(Row((1.000000)), Row(4.000000)), + sort = false) + } + + test("instr") { + val df = Seq("It was the best of times, it was the worst of times").toDF("a") + checkAnswer(df.select(instr(col("a"), "was")), Seq(Row(4)), sort = false) + } + + test("format_number1") { + + checkAnswer( + number3.select(ltrim(format_number(col("a"), 0))), + Seq(Row(("1")), Row(("2")), Row(("3"))), + sort = false) + } + + test("format_number2") { + + checkAnswer( + 
number3.select(ltrim(format_number(col("a"), 2))), + Seq(Row(("1.00")), Row(("2.00")), Row(("3.00"))), + sort = false) + } + + test("format_number3") { + + checkAnswer( + number3.select(ltrim(format_number(col("a"), -1))), + Seq(Row((null)), Row((null)), Row((null))), + sort = false) + } + + test("from_utc_timestamp") { + val expected = Seq(Timestamp.valueOf("2024-04-05 01:02:03.0")).toDF("a") + val data = Seq("2024-04-05 01:02:03").toDF("a") + checkAnswer(data.select(from_utc_timestamp(col("a"))), expected, sort = false) + } + + test("to_utc_timestamp") { + val expected = Seq(Timestamp.valueOf("2024-04-05 01:02:03.0")).toDF("a") + val data = Seq("2024-04-05 01:02:03").toDF("a") + checkAnswer(data.select(to_utc_timestamp(col("a"))), expected, sort = false) + } }