Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SNOW-1707707: Add support for Index.to_numpy #2504

Merged
merged 9 commits into from
Oct 28, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
- Added support for applying Snowpark Python function `snowflake_cortex_summarize`.
- Added support for `DataFrame.attrs` and `Series.attrs`.
- Added support for `DataFrame.style`.
- Added support for `Index.to_numpy`.

#### Improvements

Expand Down
2 changes: 2 additions & 0 deletions docs/source/modin/supported/index_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,8 @@ Methods
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``to_frame`` | Y | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``to_numpy`` | Y | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``view`` | N | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``argsort`` | N | | |
Expand Down
8 changes: 6 additions & 2 deletions src/snowflake/snowpark/modin/plugin/_internal/type_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,8 +384,12 @@ def column_astype(
new_col = cast(curr_col, LongType())
else:
new_col = cast(curr_col, to_sf_type)
# astype should not have any effect on NULL values
return iff(curr_col.is_null(), None, new_col)
# astype should not have any effect on NULL values except when casting to boolean
if isinstance(to_sf_type, BooleanType):
# treat NULL values in boolean columns as False to match pandas behavior
return iff(curr_col.is_null(), pandas_lit(False), new_col)
else:
return iff(curr_col.is_null(), None, new_col)


def is_astype_type_error(
Expand Down
16 changes: 14 additions & 2 deletions src/snowflake/snowpark/modin/plugin/_internal/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,12 @@
import pandas as native_pd
from pandas._typing import AnyArrayLike, Scalar
from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.common import is_integer_dtype, is_object_dtype, is_scalar
from pandas.core.dtypes.common import (
is_bool_dtype,
is_integer_dtype,
is_object_dtype,
is_scalar,
)
from pandas.core.dtypes.inference import is_list_like

import snowflake.snowpark.modin.plugin._internal.statement_params_constants as STATEMENT_PARAMS
Expand Down Expand Up @@ -1557,7 +1562,14 @@ def convert_str_to_timedelta(x: str) -> pd.Timedelta:
# example, an empty dataframe will be object dtype by default, or a variant, or a timestamp column with
# multiple timezones. So here we cast the index to the index_type when ret = pd.Index(...) above cannot
# figure out a non-object dtype. Note that the index_type is a logical type may not be 100% accurate.
if is_object_dtype(ret.dtype) and not is_object_dtype(index_type):
# We exclude the case where ret.dtype is object dtype while index_dtype is bool dtype. This is because
# casting None values to bool converts them to False, which results in a discrepancy with the pandas
# behavior.
if (
is_object_dtype(ret.dtype)
and not is_object_dtype(index_type)
and not is_bool_dtype(index_type)
):
# TODO: SNOW-1657460 fix index_type for timestamp_tz
try:
ret = ret.astype(index_type)
Expand Down
51 changes: 51 additions & 0 deletions src/snowflake/snowpark/modin/plugin/docstrings/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3383,6 +3383,57 @@ def to_numpy():
Returns
-------
numpy.ndarray

See Also
--------
Series.array
Get the actual data stored within.
Index.array
Get the actual data stored within.
DataFrame.to_numpy
Similar method for DataFrame.

Notes
-----
The returned array will be the same up to equality (values equal in self will be equal in the returned array; likewise for values that are not equal). When self contains an ExtensionArray, the dtype may be different. For example, for a category-dtype Series, to_numpy() will return a NumPy array and the categorical dtype will be lost.

This table lays out the different dtypes and default return types of to_numpy() for various dtypes within pandas.

+--------------------+----------------------------------+
| dtype | array type |
+--------------------+----------------------------------+
| category[T] | ndarray[T] (same dtype as input) |
+--------------------+----------------------------------+
| period | ndarray[object] (Periods) |
+--------------------+----------------------------------+
| interval | ndarray[object] (Intervals) |
+--------------------+----------------------------------+
| IntegerNA | ndarray[object] |
+--------------------+----------------------------------+
| datetime64[ns] | datetime64[ns] |
+--------------------+----------------------------------+
| datetime64[ns, tz] | ndarray[object] (Timestamps) |
+--------------------+----------------------------------+

Examples
--------
>>> ser = pd.Series(pd.Categorical(['a', 'b', 'a'])) # doctest: +SKIP
>>> ser.to_numpy() # doctest: +SKIP
array(['a', 'b', 'a'], dtype=object)

Specify the dtype to control how datetime-aware data is represented. Use dtype=object to return an ndarray of pandas Timestamp objects, each with the correct tz.

>>> ser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))
>>> ser.to_numpy(dtype=object)
array([Timestamp('2000-01-01 00:00:00+0100', tz='UTC+01:00'),
Timestamp('2000-01-02 00:00:00+0100', tz='UTC+01:00')],
dtype=object)

Or dtype='datetime64[ns]' to return an ndarray of native datetime64 values. The values are converted to UTC and the timezone info is dropped.

>>> ser.to_numpy(dtype="datetime64[ns]")
array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'],
dtype='datetime64[ns]')
"""

tolist = to_list
Expand Down
103 changes: 102 additions & 1 deletion src/snowflake/snowpark/modin/plugin/extensions/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,13 @@

import modin
import numpy as np
import numpy.typing as npt
import pandas as native_pd
from modin.pandas import DataFrame, Series
from modin.pandas.base import BasePandasDataset
from pandas import get_option
from pandas._libs import lib
from pandas._libs.lib import is_list_like, is_scalar
from pandas._libs.lib import is_list_like, is_scalar, no_default
from pandas._typing import ArrayLike, DateTimeErrorChoices, DtypeObj, NaPosition, Scalar
from pandas.core.arrays import ExtensionArray
from pandas.core.dtypes.base import ExtensionDtype
Expand Down Expand Up @@ -2079,6 +2080,102 @@ def to_frame(

return DataFrame(query_compiler=new_qc)

def to_numpy(
    self,
    dtype: npt.DTypeLike | None = None,
    copy: bool = False,
    na_value: object = no_default,
    **kwargs: Any,
) -> np.ndarray:
    """
    Return a NumPy ndarray representing the values in this Series or Index.

    Parameters
    ----------
    dtype : str or numpy.dtype, optional
        The dtype to pass to :meth:`numpy.asarray`.
    copy : bool, default False
        This argument is ignored in Snowflake backend. The data from Snowflake
        will be retrieved into the client, and a numpy array containing this
        data will be returned.
    na_value : Any, optional
        The value to use for missing values. The default value depends
        on `dtype` and the type of the array.
    **kwargs
        Additional keywords passed through to the ``to_numpy`` method
        of the underlying array (for extension arrays).

    Returns
    -------
    numpy.ndarray

    See Also
    --------
    Series.array
        Get the actual data stored within.
    Index.array
        Get the actual data stored within.
    DataFrame.to_numpy
        Similar method for DataFrame.

    Notes
    -----
    Values that compare equal in self compare equal in the returned array, and
    values that are unequal stay unequal. When self is backed by an
    ExtensionArray, the resulting dtype may differ: for example, a
    category-dtype Series yields a plain NumPy array from ``to_numpy()`` and
    the categorical dtype is lost.

    The table below lays out the different dtypes and default return types of
    ``to_numpy()`` for various dtypes within pandas.

    +--------------------+----------------------------------+
    | dtype              | array type                       |
    +--------------------+----------------------------------+
    | category[T]        | ndarray[T] (same dtype as input) |
    +--------------------+----------------------------------+
    | period             | ndarray[object] (Periods)        |
    +--------------------+----------------------------------+
    | interval           | ndarray[object] (Intervals)      |
    +--------------------+----------------------------------+
    | IntegerNA          | ndarray[object]                  |
    +--------------------+----------------------------------+
    | datetime64[ns]     | datetime64[ns]                   |
    +--------------------+----------------------------------+
    | datetime64[ns, tz] | ndarray[object] (Timestamps)     |
    +--------------------+----------------------------------+

    Examples
    --------
    >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))  # doctest: +SKIP
    >>> ser.to_numpy()  # doctest: +SKIP
    array(['a', 'b', 'a'], dtype=object)

    Specify the dtype to control how datetime-aware data is represented. Use dtype=object to return an ndarray of pandas Timestamp objects, each with the correct tz.

    >>> ser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))
    >>> ser.to_numpy(dtype=object)
    array([Timestamp('2000-01-01 00:00:00+0100', tz='UTC+01:00'),
           Timestamp('2000-01-02 00:00:00+0100', tz='UTC+01:00')],
          dtype=object)

    Or dtype='datetime64[ns]' to return an ndarray of native datetime64 values. The values are converted to UTC and the timezone info is dropped.

    >>> ser.to_numpy(dtype="datetime64[ns]")
    array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'],
          dtype='datetime64[ns]')
    """
    # ``copy`` has no effect here: the data is always materialized on the
    # client from Snowflake, so warn the caller that the flag is ignored.
    if copy:
        WarningMessage.ignored_argument(
            operation="to_numpy",
            argument="copy",
            message="copy is ignored in Snowflake backend",
        )
    # Pull the index down to a native pandas object, convert it with the
    # requested dtype/na_value, and flatten to guarantee a 1-D result.
    native_index = self.to_pandas()
    values = native_index.to_numpy(
        dtype=dtype,
        na_value=na_value,
        **kwargs,
    )
    return values.flatten()

@index_not_implemented()
def fillna(self) -> None:
"""
Expand Down Expand Up @@ -2601,6 +2698,10 @@ def __array__(self, dtype: Any = None) -> np.ndarray:
"""
The array interface, return the values.
"""
# Ensure that the existing index dtype is preserved in the returned array
# if no other dtype is given.
if dtype is None:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This fixes a pre-existing issue that was causing some test failures with the introduction of Index.to_numpy.

dtype = self.dtype
return self.to_pandas().__array__(dtype=dtype)

def __repr__(self) -> str:
Expand Down
39 changes: 18 additions & 21 deletions tests/integ/modin/frame/test_getitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,31 +36,28 @@
np.array([], dtype=bool),
],
)
@sql_count_checker(query_count=1, join_count=1)
def test_df_getitem_with_boolean_list_like(
key, default_index_snowpark_pandas_df, default_index_native_df
):
# one added query to convert to native pandas and one added query for series initialization
with SqlCounter(
query_count=3 if isinstance(key, native_pd.Index) else 1, join_count=1
):
# df[boolean list-like key] is the same as df.loc[:, boolean list-like key]
if isinstance(key, native_pd.Index):
Copy link
Contributor Author

@sfc-gh-helmeleegy sfc-gh-helmeleegy Oct 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There was no need to convert the native pandas index into a Snowpark pandas index when the df it's being passed to is actually a native pandas DataFrame. This was also causing test failures on the introduction of Index.to_numpy.

key = pd.Index(key)

def get_helper(df):
if isinstance(df, pd.DataFrame):
return df[key]
# If pandas df, adjust the length of the df and key since boolean keys need to be the same length as the axis.
_key = try_convert_index_to_native(key)
_df = df.iloc[: len(key)]
_key = _key[: _df.shape[1]]
return _df[_key]
# df[boolean list-like key] is the same as df.loc[:, boolean list-like key]

eval_snowpark_pandas_result(
default_index_snowpark_pandas_df,
default_index_native_df,
get_helper,
)
def get_helper(df, key):
if isinstance(df, pd.DataFrame):
if isinstance(key, native_pd.Index):
key = pd.Index(key)
return df[key]
# If pandas df, adjust the length of the df and key since boolean keys need to be the same length as the axis.
_key = try_convert_index_to_native(key)
_df = df.iloc[: len(key)]
_key = _key[: _df.shape[1]]
return _df[_key]

eval_snowpark_pandas_result(
default_index_snowpark_pandas_df,
default_index_native_df,
lambda df: get_helper(df, key),
)


@pytest.mark.parametrize(
Expand Down
6 changes: 5 additions & 1 deletion tests/integ/modin/index/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,11 @@ def test_index_astype_empty_index(from_type, to_type):
native_index = native_pd.Index([], dtype=from_type)
snow_index = pd.Index(native_index)
with SqlCounter(query_count=1):
assert_index_equal(snow_index.astype(to_type), native_index.astype(to_type))
# exact=False is used because of a discrepancy in the "inferred_type" attribute
# when to_type is bool between Snowpark pandas (empty) and native pandas (bool).
assert_index_equal(
snow_index.astype(to_type), native_index.astype(to_type), exact=False
)


@pytest.mark.parametrize(
Expand Down
7 changes: 5 additions & 2 deletions tests/integ/modin/test_to_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,15 @@
[datetime.datetime(2023, 1, 1), datetime.datetime(2023, 1, 1, 1, 2, 3), None],
],
)
@pytest.mark.parametrize("pandas_obj", ["DataFrame", "Series"])
@pytest.mark.parametrize("pandas_obj", ["DataFrame", "Series", "Index"])
@pytest.mark.parametrize("func", ["to_numpy", "values"])
def test_to_numpy_basic(data, pandas_obj, func):
if pandas_obj == "Series":
df = pd.Series(data)
native_df = native_pd.Series(data)
elif pandas_obj == "Index":
df = pd.Index(data)
native_df = native_pd.Index(data)
else:
df = pd.DataFrame([data, data])
native_df = native_pd.DataFrame([data, data])
Expand Down Expand Up @@ -109,7 +112,7 @@ def test_tz_aware_data_to_numpy(session):
assert_array_equal(df.to_numpy(), expected_result)


@pytest.mark.parametrize("pandas_obj", ["DataFrame", "Series"])
@pytest.mark.parametrize("pandas_obj", ["DataFrame", "Series", "Index"])
@sql_count_checker(query_count=1)
def test_variant_data_to_numpy(pandas_obj):
data = [
Expand Down
Loading