SNOW-802269-months_between_format_number (#159)

* added new functions, test cases * added return parameter to the comment section * added return parameter to the comment section * Added column name as a string changes * modified the months_between
snowflakedb · Sep 12, 2024 · 6a9edb1 · 6a9edb1
1 parent eed60a4
commit 6a9edb1
Show file tree

Hide file tree

Showing 4 changed files with 344 additions and 0 deletions.
diff --git a/src/main/java/com/snowflake/snowpark_java/Functions.java b/src/main/java/com/snowflake/snowpark_java/Functions.java
@@ -4504,11 +4504,142 @@ public static Column from_unixtime(Column ut, String f) {
    * [Row(SEQ8(0)=0),Row(SEQ8(0)=1), Row(SEQ8(0)=2)]
    * }</pre>
    *
+   * @return A sequence of monotonically increasing integers, with wrap-around which happens after
+   *     the largest representable 8-byte integer.
    * @since 1.15.0
    */
   public static Column monotonically_increasing_id() {
     return new Column(com.snowflake.snowpark.functions.monotonically_increasing_id());
   }
+  /**
+   * Returns the number of months between the values of the columns named {@code start} and
+   * {@code end}.
+   *
+   * <p>Both arguments are column names, not date literals: the underlying Scala function resolves
+   * each name to a column before calling {@code MONTHS_BETWEEN(end, start)}.
+   *
+   * <p>A whole number is returned if both inputs have the same day of month or both are the last
+   * day of their respective months. Otherwise, the difference is calculated assuming 31 days per
+   * month.
+   *
+   * <p>For example:
+   *
+   * <pre>{@code
+   * // df has DATE columns "a" and "b":
+   * //   a = 2010-07-02, b = 2010-08-02
+   * //   a = 2020-08-02, b = 2020-12-02
+   * df.select(Functions.months_between("b", "a"));
+   * // yields 1.0 and 4.0
+   * }</pre>
+   *
+   * @param end Name of the column holding the end value (a date, timestamp or string). If a
+   *     string, the data must be in a format that can be cast to a timestamp, such as `yyyy-MM-dd`
+   *     or `yyyy-MM-dd HH:mm:ss.SSSS`
+   * @param start Name of the column holding the start value (a date, timestamp or string). If a
+   *     string, the data must be in a format that can be cast to a timestamp, such as `yyyy-MM-dd`
+   *     or `yyyy-MM-dd HH:mm:ss.SSSS`
+   * @return A column with the number of months as a double. Negative if `end` is before `start`
+   * @since 1.15.0
+   */
+  public static Column months_between(String end, String start) {
+    // Fully qualified like the sibling wrappers, so it does not depend on a `functions` import.
+    return new Column(com.snowflake.snowpark.functions.months_between(end, start));
+  }
+
+  /**
+   * Locate the position of the first occurrence of substr column in the given string. Returns null
+   * if either of the arguments are null.
+   *
+   * <p>Example
+   *
+   * <pre>{@code
+   * SELECT id,
+   *        string1,
+   *        REGEXP_SUBSTR(string1, 'nevermore\\d') AS substring,
+   *        REGEXP_INSTR( string1, 'nevermore\\d') AS position
+   *   FROM demo1
+   *   ORDER BY id;
+   *
+   *   +----+-------------------------------------+------------+----------+
+   * | ID | STRING1                             | SUBSTRING  | POSITION |
+   * |----+-------------------------------------+------------+----------|
+   * |  1 | nevermore1, nevermore2, nevermore3. | nevermore1 |        1 |
+   * +----+-------------------------------------+------------+----------+
+   * }</pre>
+   *
+   * The position is not zero based, but 1 based index. Returns 0 if substr could not be found in
+   * str.
+   *
+   * @param str Column on which instr has to be applied
+   * @param substring Pattern to be retrieved
+   * @return A null if either of the arguments are null.
+   * @since 1.15.0
+   */
+  public static Column instr(Column str, String substring) {
+    return new Column(com.snowflake.snowpark.functions.instr(str.toScalaColumn(), substring));
+  }
+
+  /**
+   * Given a timestamp like '2017-07-14 02:40:00.0', interprets it as a time in UTC, and renders
+   * that time as a timestamp in the given time zone. For example, 'GMT+1' would yield '2017-07-14
+   * 03:40:00.0'.
+   *
+   * <p>For Example
+   *
+   * <pre>{@code
+   * ALTER SESSION SET TIMEZONE = 'America/Los_Angeles';
+   * SELECT TO_TIMESTAMP_TZ('2024-04-05 01:02:03');
+   *  +----------------------------------------+
+   * | TO_TIMESTAMP_TZ('2024-04-05 01:02:03') |
+   * |----------------------------------------|
+   * | 2024-04-05 01:02:03.000 -0700          |
+   * +----------------------------------------+
+   * }</pre>
+   *
+   * @param ts A date, timestamp or string. If a string, the data must be in a format that can be
+   *     cast to a timestamp, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` A string detailing
+   *     the time zone ID that the input should be adjusted to. It should be in the format of either
+   *     region-based zone IDs or zone offsets. Region IDs must have the form 'area/city', such as
+   *     'America/Los_Angeles'. Zone offsets must be in the format '(+|-)HH:mm', for example
+   *     '-08:00' or '+01:00'. Also 'UTC' and 'Z' are supported as aliases of '+00:00'. Other short
+   *     names are not recommended to use because they can be ambiguous.
+   * @return A timestamp, or null if `ts` was a string that could not be cast to a timestamp or `tz`
+   *     was an invalid value
+   * @since 1.15.0
+   */
+  public static Column from_utc_timestamp(Column ts) {
+    return new Column(com.snowflake.snowpark.functions.from_utc_timestamp(ts.toScalaColumn()));
+  }
+
+  /**
+   * Given a timestamp like '2017-07-14 02:40:00.0', interprets it as a time in the given time zone,
+   * and renders that time as a timestamp in UTC. For example, 'GMT+1' would yield '2017-07-14
+   * 01:40:00.0'.
+   *
+   * @param ts A date, timestamp or string. If a string, the data must be in a format that can be
+   *     cast to a timestamp, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` A string detailing
+   *     the time zone ID that the input should be adjusted to. It should be in the format of either
+   *     region-based zone IDs or zone offsets. Region IDs must have the form 'area/city', such as
+   *     'America/Los_Angeles'. Zone offsets must be in the format '(+|-)HH:mm', for example
+   *     '-08:00' or '+01:00'. Also 'UTC' and 'Z' are supported as aliases of '+00:00'. Other short
+   *     names are not recommended to use because they can be ambiguous.
+   * @return A timestamp, or null if `ts` was a string that could not be cast to a timestamp or `tz`
+   *     was an invalid value
+   * @since 1.15.0
+   */
+  public static Column to_utc_timestamp(Column ts) {
+    return new Column(com.snowflake.snowpark.functions.to_utc_timestamp(ts.toScalaColumn()));
+  }
+
+  /**
+   * Formats numeric column x to a format like '#,###,###.##', rounded to d decimal places with
+   * HALF_EVEN round mode, and returns the result as a string column.
+   *
+   * <p>If d is 0, the result has no decimal point or fractional part. If d is less than 0, the
+   * result will be null.
+   *
+   * @param x numeric column to be transformed
+   * @param d Amount of decimal for the number format
+   * @return Number casted to the specific string format
+   * @since 1.15.0
+   */
+  public static Column format_number(Column x, Integer d) {
+    return new Column(com.snowflake.snowpark.functions.format_number(x.toScalaColumn(), d));
+  }
 
   /* Returns a Column expression with values sorted in descending order.
    *

diff --git a/src/main/scala/com/snowflake/snowpark/functions.scala b/src/main/scala/com/snowflake/snowpark/functions.scala
@@ -3777,6 +3777,116 @@ object functions {
    */
   def monotonically_increasing_id(): Column = builtin("seq8")()
 
+  /**
+   * Returns the number of months between the values of the columns named `start` and `end`.
+   *
+   * Both arguments are column names: each is resolved with `col` and the result is
+   * `MONTHS_BETWEEN(end, start)`.
+   *
+   * A whole number is returned if both inputs have the same day of month or both are the last day
+   * of their respective months. Otherwise, the difference is calculated assuming 31 days per month.
+   *
+   * For example:
+   * {{{
+   * // df has DATE columns "a" and "b":
+   * //   a = 2010-08-02, b = 2010-07-02
+   * //   a = 2020-12-02, b = 2020-08-02
+   * df.select(months_between("a", "b"))  // yields 1.0 and 4.0
+   * }}}
+   * @since 1.15.0
+   * @param end  Name of the column holding the end value. If a string column, the data must be
+   *             in a format that can be cast to a timestamp, such as yyyy-MM-dd
+   *             or yyyy-MM-dd HH:mm:ss.SSSS
+   * @param start  Name of the column holding the start value. If a string column, the data must
+   *               be in a format that can be cast to a timestamp, such as yyyy-MM-dd
+   *               or yyyy-MM-dd HH:mm:ss.SSSS
+   * @return A column with the number of months as a double. Negative if end is before start
+   */
+  def months_between(end: String, start: String): Column = {
+    val endColumn = col(end)
+    val startColumn = col(start)
+    builtin("MONTHS_BETWEEN")(endColumn, startColumn)
+  }
+
+  /**
+   * Finds the position of the first match of `substring` in the string column `str`.
+   * Returns null if either of the arguments are null.
+   *
+   * The lookup uses the `REGEXP_INSTR` builtin with `substring` as its pattern argument.
+   *
+   * For example:
+   * {{{
+   * SELECT id,
+   *        string1,
+   *        REGEXP_SUBSTR(string1, 'nevermore\\d') AS substring,
+   *        REGEXP_INSTR( string1, 'nevermore\\d') AS position
+   *   FROM demo1
+   *   ORDER BY id;
+   *
+   * +----+-------------------------------------+------------+----------+
+   * | ID | STRING1                             | SUBSTRING  | POSITION |
+   * |----+-------------------------------------+------------+----------|
+   * |  1 | nevermore1, nevermore2, nevermore3. | nevermore1 |        1 |
+   * +----+-------------------------------------+------------+----------+
+   * }}}
+   *
+   * @since 1.15.0
+   * @note The position is not zero based, but 1 based index. Returns 0 if substr
+   * could not be found in str.
+   * @param str Column on which instr has to be applied
+   * @param substring Pattern to look for in `str`
+   * @return A column with the 1-based position of the first match
+   */
+  def instr(str: Column, substring: String): Column = {
+    val regexpInstr = builtin("REGEXP_INSTR")
+    regexpInstr(str, substring)
+  }
+
+  /**
+   * Converts `ts` to a timestamp with time zone by applying the `TO_TIMESTAMP_TZ` builtin.
+   *
+   * This function takes no target time zone argument. NOTE(review): the example below suggests
+   * that the session `TIMEZONE` setting determines the offset of the result; confirm against the
+   * Snowflake documentation.
+   *
+   * For example:
+   * {{{
+   * ALTER SESSION SET TIMEZONE = 'America/Los_Angeles';
+   * SELECT TO_TIMESTAMP_TZ('2024-04-05 01:02:03');
+   * +----------------------------------------+
+   * | TO_TIMESTAMP_TZ('2024-04-05 01:02:03') |
+   * |----------------------------------------|
+   * | 2024-04-05 01:02:03.000 -0700          |
+   * +----------------------------------------+
+   * }}}
+   *
+   * @since 1.15.0
+   * @param ts A date, timestamp or string column. If a string, the data must be in a format that
+   *           can be cast to a timestamp, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS`
+   * @return A column holding the converted timestamp
+   */
+  def from_utc_timestamp(ts: Column): Column = {
+    val toTimestampTz = builtin("TO_TIMESTAMP_TZ")
+    toTimestampTz(ts)
+  }
+
+  /**
+   * Converts `ts` to a timestamp with time zone by applying the `TO_TIMESTAMP_TZ` builtin.
+   *
+   * This function takes no source time zone argument. NOTE(review): it calls the same builtin
+   * with the same argument as `from_utc_timestamp`, so both currently produce the same result;
+   * confirm whether a conversion to UTC was intended.
+   *
+   * @since 1.15.0
+   * @param ts A date, timestamp or string column. If a string, the data must be in a format that
+   *           can be cast to a timestamp, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS`
+   * @return A column holding the converted timestamp
+   */
+  def to_utc_timestamp(ts: Column): Column = {
+    val toTimestampTz = builtin("TO_TIMESTAMP_TZ")
+    toTimestampTz(ts)
+  }
+
+  /**
+   * Formats numeric column x as a string column with thousands separators and d digits after
+   * the decimal point.
+   *
+   * If d is 0, the result has no decimal point or fractional part.
+   * If d is less than 0, the result will be null.
+   *
+   * The formatting is done by `TO_VARCHAR` with the mask `999,999`, followed for positive d by
+   * a decimal point and d zeros. NOTE(review): the mask has six integer digit positions and the
+   * rounding mode is whatever `TO_VARCHAR` applies; confirm the behaviour for larger values and
+   * for ties.
+   *
+   * @since 1.15.0
+   * @param x numeric column to be transformed
+   * @param d Amount of decimal for the number format
+   *
+   * @return Number casted to the specific string format
+   */
+  def format_number(x: Column, d: Int): Column =
+    if (d < 0) {
+      lit(null)
+    } else {
+      // d is non-negative here, so "not zero" is the same as "greater than zero".
+      val mask = if (d == 0) "999,999" else s"999,999.${"0" * d}"
+      builtin("TO_VARCHAR")(x, mask)
+    }
   /* Returns a Column expression with values sorted in descending order.
    * Example:
    * {{{

diff --git a/src/test/java/com/snowflake/snowpark_test/JavaFunctionSuite.java b/src/test/java/com/snowflake/snowpark_test/JavaFunctionSuite.java
@@ -3135,4 +3135,55 @@ public void unhex() {
     Row[] expected = {Row.create("1"), Row.create("2"), Row.create("3")};
     checkAnswer(df.select(Functions.unhex(Functions.col("a"))), expected, false);
   }
+
+  @Test
+  public void months_between() {
+    // Two (a, b) date pairs where b is one and four whole months after a.
+    String query =
+        "select * from values('2010-07-02'::Date,'2010-08-02'::Date), "
+            + "('2020-08-02'::Date,'2020-12-02'::Date) as t(a,b)";
+    DataFrame dates = getSession().sql(query);
+    Row[] expected = {Row.create(1.000000), Row.create(4.000000)};
+    checkAnswer(dates.select(Functions.months_between("b", "a")), expected, false);
+  }
+
+  @Test
+  public void instr() {
+    String query =
+        "select * from values('It was the best of times, it was the worst of times') as t(a)";
+    DataFrame sentence = getSession().sql(query);
+    // The first "was" starts at 1-based position 4.
+    Row[] expected = {Row.create(4)};
+    checkAnswer(sentence.select(Functions.instr(sentence.col("a"), "was")), expected, false);
+  }
+
+  @Test
+  public void format_number1() {
+    DataFrame numbers = getSession().sql("select * from values(1),(2),(3) as t(a)");
+    // Zero decimals: the formatted text carries no decimal point.
+    Row[] expected = {Row.create("1"), Row.create("2"), Row.create("3")};
+    checkAnswer(
+        numbers.select(Functions.ltrim(Functions.format_number(numbers.col("a"), 0))),
+        expected,
+        false);
+  }
+
+  @Test
+  public void format_number2() {
+    DataFrame numbers = getSession().sql("select * from values(1),(2),(3) as t(a)");
+    // Two decimals: every value gains a ".00" fractional part.
+    Row[] expected = {Row.create("1.00"), Row.create("2.00"), Row.create("3.00")};
+    checkAnswer(
+        numbers.select(Functions.ltrim(Functions.format_number(numbers.col("a"), 2))),
+        expected,
+        false);
+  }
+
+  @Test
+  public void from_utc_timestamp() {
+    DataFrame input = getSession().sql("select * from values('2024-04-05 01:02:03') as t(a)");
+    // The string is expected to come back as the same wall-clock timestamp.
+    Timestamp wanted = Timestamp.valueOf("2024-04-05 01:02:03.0");
+    Row[] expected = {Row.create(wanted)};
+    checkAnswer(input.select(Functions.from_utc_timestamp(input.col("a"))), expected, false);
+  }
+
+  @Test
+  public void to_utc_timestamp() {
+    DataFrame input = getSession().sql("select * from values('2024-04-05 01:02:03') as t(a)");
+    // The string is expected to come back as the same wall-clock timestamp.
+    Timestamp wanted = Timestamp.valueOf("2024-04-05 01:02:03.0");
+    Row[] expected = {Row.create(wanted)};
+    checkAnswer(input.select(Functions.to_utc_timestamp(input.col("a"))), expected, false);
+  }
 }
diff --git a/src/test/scala/com/snowflake/snowpark_test/FunctionSuite.scala b/src/test/scala/com/snowflake/snowpark_test/FunctionSuite.scala
@@ -2466,6 +2466,58 @@ trait FunctionSuite extends TestData {
       Seq(Row("1"), Row("2"), Row("3")),
       sort = false)
   }
+  test("months_between") {
+    val input = Seq(
+      (Date.valueOf("2010-08-02"), Date.valueOf("2010-07-02")),
+      (Date.valueOf("2020-12-02"), Date.valueOf("2020-08-02")))
+      .toDF("a", "b")
+    // Call the public column-name wrapper so the test covers functions.months_between itself
+    // instead of a locally built MONTHS_BETWEEN builtin.
+    checkAnswer(
+      input.select(functions.months_between("a", "b")),
+      Seq(Row(1.000000), Row(4.000000)),
+      sort = false)
+  }
+
+  test("instr") {
+    val sentence = Seq("It was the best of times, it was the worst of times").toDF("a")
+    // The first "was" starts at 1-based position 4.
+    val position = instr(col("a"), "was")
+    checkAnswer(sentence.select(position), Seq(Row(4)), sort = false)
+  }
+
+  test("format_number1") {
+    // d = 0: the formatted text carries no decimal point.
+    val formatted = ltrim(format_number(col("a"), 0))
+    checkAnswer(number3.select(formatted), Seq(Row("1"), Row("2"), Row("3")), sort = false)
+  }
+
+  test("format_number2") {
+    // d = 2: every value gains a ".00" fractional part.
+    val formatted = ltrim(format_number(col("a"), 2))
+    checkAnswer(
+      number3.select(formatted),
+      Seq(Row("1.00"), Row("2.00"), Row("3.00")),
+      sort = false)
+  }
+
+  test("format_number3") {
+    // A negative d makes format_number return a null literal, so every row is null.
+    val formatted = ltrim(format_number(col("a"), -1))
+    checkAnswer(number3.select(formatted), Seq(Row(null), Row(null), Row(null)), sort = false)
+  }
+
+  test("from_utc_timestamp") {
+    // The string input is expected to come back as the same wall-clock timestamp.
+    val wanted = Timestamp.valueOf("2024-04-05 01:02:03.0")
+    val expected = Seq(wanted).toDF("a")
+    val input = Seq("2024-04-05 01:02:03").toDF("a")
+    val converted = from_utc_timestamp(col("a"))
+    checkAnswer(input.select(converted), expected, sort = false)
+  }
+
+  test("to_utc_timestamp") {
+    // The string input is expected to come back as the same wall-clock timestamp.
+    val wanted = Timestamp.valueOf("2024-04-05 01:02:03.0")
+    val expected = Seq(wanted).toDF("a")
+    val input = Seq("2024-04-05 01:02:03").toDF("a")
+    val converted = to_utc_timestamp(col("a"))
+    checkAnswer(input.select(converted), expected, sort = false)
+  }
 
 }