From 300d2f9d9f6dd35805c3bcb6504766840cc8ea78 Mon Sep 17 00:00:00 2001
From: Hazem Elmeleegy
Date: Thu, 24 Oct 2024 01:57:22 -0700
Subject: [PATCH 1/7] SNOW-1707707: Add support for Index.to_numpy

---
 CHANGELOG.md | 1 +
 .../modin/supported/index_supported.rst | 2 +
 .../snowpark/modin/plugin/_internal/utils.py | 16 ++-
 .../modin/plugin/docstrings/series.py | 55 +++++++++
 .../snowpark/modin/plugin/extensions/index.py | 104 +++++++++++++++++-
 tests/integ/modin/test_to_numpy.py | 7 +-
 6 files changed, 180 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7ce7fee72f4..7e1aece4b80 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -38,6 +38,7 @@
 - Added support for timedelta inputs in `value_counts()`.
 - Added support for applying Snowpark Python function `snowflake_cortex_summarize`.
 - Added support for `DataFrame`/`Series.attrs`
+- Added support for `Index.to_numpy`.
 
 #### Improvements
 
diff --git a/docs/source/modin/supported/index_supported.rst b/docs/source/modin/supported/index_supported.rst
index b67bd0e18a4..ffde21ee377 100644
--- a/docs/source/modin/supported/index_supported.rst
+++ b/docs/source/modin/supported/index_supported.rst
@@ -155,6 +155,8 @@ Methods
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``to_frame`` | Y | | |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
+| ``to_numpy`` | Y | | |
++-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``view`` | N | | |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``argsort`` | N | | |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
diff --git a/src/snowflake/snowpark/modin/plugin/_internal/utils.py b/src/snowflake/snowpark/modin/plugin/_internal/utils.py
index 3f01e2d095e..1366a79918b 100644
--- a/src/snowflake/snowpark/modin/plugin/_internal/utils.py
+++ b/src/snowflake/snowpark/modin/plugin/_internal/utils.py
@@ -14,7 +14,12 @@
 import pandas as native_pd
 from pandas._typing import AnyArrayLike, Scalar
 from pandas.core.dtypes.base import ExtensionDtype
-from pandas.core.dtypes.common import is_integer_dtype, is_object_dtype, is_scalar
+from pandas.core.dtypes.common import (
+    is_bool_dtype,
+    is_integer_dtype,
+    is_object_dtype,
+    is_scalar,
+)
 from pandas.core.dtypes.inference import is_list_like
 
 import snowflake.snowpark.modin.plugin._internal.statement_params_constants as STATEMENT_PARAMS
@@ -1557,7 +1562,14 @@ def convert_str_to_timedelta(x: str) -> pd.Timedelta:
     # example, an empty dataframe will be object dtype by default, or a variant, or a timestamp column with
     # multiple timezones. So here we cast the index to the index_type when ret = pd.Index(...) above cannot
     # figure out a non-object dtype. Note that the index_type is a logical type may not be 100% accurate.
-    if is_object_dtype(ret.dtype) and not is_object_dtype(index_type):
+    # We exclude the case where ret.dtype is object dtype while index_type is bool dtype. This is because
+    # casting None values to bool converts them to False, which results in a discrepancy with the pandas
+    # behavior.
+ if ( + is_object_dtype(ret.dtype) + and not is_object_dtype(index_type) + and not is_bool_dtype(index_type) + ): # TODO: SNOW-1657460 fix index_type for timestamp_tz try: ret = ret.astype(index_type) diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/series.py b/src/snowflake/snowpark/modin/plugin/docstrings/series.py index 4b0ea8f748c..08091389706 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/series.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/series.py @@ -3383,6 +3383,61 @@ def to_numpy(): Returns ------- numpy.ndarray + + See Also + -------- + Series.array + Get the actual data stored within. + Index.array + Get the actual data stored within. + DataFrame.to_numpy + Similar method for DataFrame. + + Notes + ----- + The returned array will be the same up to equality (values equal in self will be equal in the returned array; likewise for values that are not equal). When self contains an ExtensionArray, the dtype may be different. For example, for a category-dtype Series, to_numpy() will return a NumPy array and the categorical dtype will be lost. + + For NumPy dtypes, this will be a reference to the actual data stored in this Series or Index (assuming copy=False). Modifying the result in place will modify the data stored in the Series or Index (not that we recommend doing that). + + For extension types, to_numpy() may require copying data and coercing the result to a NumPy type (possibly object), which may be expensive. When you need a no-copy reference to the underlying data, Series.array should be used instead. + + This table lays out the different dtypes and default return types of to_numpy() for various dtypes within pandas. + + --------------------------------------------------------- + | dtype | array type | + --------------------------------------------------------- + | category[T] | ndarray[T] (same dtype as input) | + --------------------------------------------------------- + | period | ndarray[object] (Periods) | + --------------------------------------------------------- + | interval | ndarray[object] (Intervals) | + --------------------------------------------------------- + | IntegerNA | ndarray[object] | + --------------------------------------------------------- + | datetime64[ns] | datetime64[ns] | + --------------------------------------------------------- + | datetime64[ns, tz] | ndarray[object] (Timestamps) | + --------------------------------------------------------- + + Examples + -------- + >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a'])) # doctest: +SKIP + >>> ser.to_numpy() # doctest: +SKIP + array(['a', 'b', 'a'], dtype=object) + + Specify the dtype to control how datetime-aware data is represented. Use dtype=object to return an ndarray of pandas Timestamp objects, each with the correct tz. + + >>> ser = pd.Series(pd.date_range('2000', periods=2, tz="CET")) + >>> ser.to_numpy(dtype=object) + array([Timestamp('2000-01-01 00:00:00+0100', tz='CET'), + Timestamp('2000-01-02 00:00:00+0100', tz='CET')], + dtype=object) + + Or dtype='datetime64[ns]' to return an ndarray of native datetime64 values. The values are converted to UTC and the timezone info is dropped. 
+ + >>> ser.to_numpy(dtype="datetime64[ns]") + array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'], + dtype='datetime64[ns]') """ tolist = to_list diff --git a/src/snowflake/snowpark/modin/plugin/extensions/index.py b/src/snowflake/snowpark/modin/plugin/extensions/index.py index 1513fe86e08..a0e86c2d82a 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/index.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/index.py @@ -29,12 +29,13 @@ import modin import numpy as np +import numpy.typing as npt import pandas as native_pd from modin.pandas import DataFrame, Series from modin.pandas.base import BasePandasDataset from pandas import get_option from pandas._libs import lib -from pandas._libs.lib import is_list_like, is_scalar +from pandas._libs.lib import is_list_like, is_scalar, no_default from pandas._typing import ArrayLike, DateTimeErrorChoices, DtypeObj, NaPosition, Scalar from pandas.core.arrays import ExtensionArray from pandas.core.dtypes.base import ExtensionDtype @@ -2079,6 +2080,107 @@ def to_frame( return DataFrame(query_compiler=new_qc) + def to_numpy( + self, + dtype: npt.DTypeLike | None = None, + copy: bool = False, + na_value: object = no_default, + **kwargs: Any, + ) -> np.ndarray: + """ + A NumPy ndarray representing the values in this Series or Index. + + Parameters + ---------- + dtype : str or numpy.dtype, optional + The dtype to pass to :meth:`numpy.asarray`. + copy : bool, default False + This argument is ignored in Snowflake backend. The data from Snowflake + will be retrieved into the client, and a numpy array containing this + data will be returned. + na_value : Any, optional + The value to use for missing values. The default value depends + on `dtype` and the type of the array. + **kwargs + Additional keywords passed through to the ``to_numpy`` method + of the underlying array (for extension arrays). + + Returns + ------- + numpy.ndarray + + See Also + -------- + Series.array + Get the actual data stored within. + Index.array + Get the actual data stored within. + DataFrame.to_numpy + Similar method for DataFrame. + + Notes + ----- + The returned array will be the same up to equality (values equal in self will be equal in the returned array; likewise for values that are not equal). When self contains an ExtensionArray, the dtype may be different. For example, for a category-dtype Series, to_numpy() will return a NumPy array and the categorical dtype will be lost. + + For NumPy dtypes, this will be a reference to the actual data stored in this Series or Index (assuming copy=False). Modifying the result in place will modify the data stored in the Series or Index (not that we recommend doing that). + + For extension types, to_numpy() may require copying data and coercing the result to a NumPy type (possibly object), which may be expensive. When you need a no-copy reference to the underlying data, Series.array should be used instead. + + This table lays out the different dtypes and default return types of to_numpy() for various dtypes within pandas. 
+ + --------------------------------------------------------- + | dtype | array type | + --------------------------------------------------------- + | category[T] | ndarray[T] (same dtype as input) | + --------------------------------------------------------- + | period | ndarray[object] (Periods) | + --------------------------------------------------------- + | interval | ndarray[object] (Intervals) | + --------------------------------------------------------- + | IntegerNA | ndarray[object] | + --------------------------------------------------------- + | datetime64[ns] | datetime64[ns] | + --------------------------------------------------------- + | datetime64[ns, tz] | ndarray[object] (Timestamps) | + --------------------------------------------------------- + + Examples + -------- + >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a'])) # doctest: +SKIP + >>> ser.to_numpy() # doctest: +SKIP + array(['a', 'b', 'a'], dtype=object) + + Specify the dtype to control how datetime-aware data is represented. Use dtype=object to return an ndarray of pandas Timestamp objects, each with the correct tz. + + >>> ser = pd.Series(pd.date_range('2000', periods=2, tz="CET")) + >>> ser.to_numpy(dtype=object) + array([Timestamp('2000-01-01 00:00:00+0100', tz='CET'), + Timestamp('2000-01-02 00:00:00+0100', tz='CET')], + dtype=object) + + Or dtype='datetime64[ns]' to return an ndarray of native datetime64 values. The values are converted to UTC and the timezone info is dropped. + + >>> ser.to_numpy(dtype="datetime64[ns]") + array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'], + dtype='datetime64[ns]') + """ + if copy: + WarningMessage.ignored_argument( + operation="to_numpy", + argument="copy", + message="copy is ignored in Snowflake backend", + ) + # return self.to_pandas().array + return ( + self.to_pandas() + .to_numpy( + dtype=dtype, + na_value=na_value, + **kwargs, + ) + .flatten() + ) + @index_not_implemented() def fillna(self) -> None: """ diff --git a/tests/integ/modin/test_to_numpy.py b/tests/integ/modin/test_to_numpy.py index 33829d5c39b..0e61f595e4e 100644 --- a/tests/integ/modin/test_to_numpy.py +++ b/tests/integ/modin/test_to_numpy.py @@ -42,12 +42,15 @@ [datetime.datetime(2023, 1, 1), datetime.datetime(2023, 1, 1, 1, 2, 3), None], ], ) -@pytest.mark.parametrize("pandas_obj", ["DataFrame", "Series"]) +@pytest.mark.parametrize("pandas_obj", ["DataFrame", "Series", "Index"]) @pytest.mark.parametrize("func", ["to_numpy", "values"]) def test_to_numpy_basic(data, pandas_obj, func): if pandas_obj == "Series": df = pd.Series(data) native_df = native_pd.Series(data) + elif pandas_obj == "Index": + df = pd.Index(data) + native_df = native_pd.Index(data) else: df = pd.DataFrame([data, data]) native_df = native_pd.DataFrame([data, data]) @@ -109,7 +112,7 @@ def test_tz_aware_data_to_numpy(session): assert_array_equal(df.to_numpy(), expected_result) -@pytest.mark.parametrize("pandas_obj", ["DataFrame", "Series"]) +@pytest.mark.parametrize("pandas_obj", ["DataFrame", "Series", "Index"]) @sql_count_checker(query_count=1) def test_variant_data_to_numpy(pandas_obj): data = [ From b57bdd5aa9bc8e8a63d2e07679e895e34a1f7c01 Mon Sep 17 00:00:00 2001 From: Hazem Elmeleegy Date: Fri, 25 Oct 2024 11:21:09 -0700 Subject: [PATCH 2/7] fix tests --- .../modin/plugin/_internal/type_utils.py | 8 +++- .../modin/plugin/docstrings/series.py | 8 +--- .../snowpark/modin/plugin/extensions/index.py | 13 +++---- tests/integ/modin/frame/test_getitem.py | 39 +++++++++---------- tests/integ/modin/index/test_astype.py | 
12 +++--- 5 files changed, 39 insertions(+), 41 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/_internal/type_utils.py b/src/snowflake/snowpark/modin/plugin/_internal/type_utils.py index 1b55d3af611..404e5ffd76a 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/type_utils.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/type_utils.py @@ -384,8 +384,12 @@ def column_astype( new_col = cast(curr_col, LongType()) else: new_col = cast(curr_col, to_sf_type) - # astype should not have any effect on NULL values - return iff(curr_col.is_null(), None, new_col) + # astype should not have any effect on NULL values except when casting to boolean + if isinstance(to_sf_type, BooleanType): + # treat NULL values in boolean columns as False to match pandas behavior + return iff(curr_col.is_null(), pandas_lit(False), new_col) + else: + return iff(curr_col.is_null(), None, new_col) def is_astype_type_error( diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/series.py b/src/snowflake/snowpark/modin/plugin/docstrings/series.py index 08091389706..976d2b01c24 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/series.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/series.py @@ -3397,10 +3397,6 @@ def to_numpy(): ----- The returned array will be the same up to equality (values equal in self will be equal in the returned array; likewise for values that are not equal). When self contains an ExtensionArray, the dtype may be different. For example, for a category-dtype Series, to_numpy() will return a NumPy array and the categorical dtype will be lost. - For NumPy dtypes, this will be a reference to the actual data stored in this Series or Index (assuming copy=False). Modifying the result in place will modify the data stored in the Series or Index (not that we recommend doing that). - - For extension types, to_numpy() may require copying data and coercing the result to a NumPy type (possibly object), which may be expensive. When you need a no-copy reference to the underlying data, Series.array should be used instead. - This table lays out the different dtypes and default return types of to_numpy() for various dtypes within pandas. --------------------------------------------------------- @@ -3429,8 +3425,8 @@ def to_numpy(): >>> ser = pd.Series(pd.date_range('2000', periods=2, tz="CET")) >>> ser.to_numpy(dtype=object) - array([Timestamp('2000-01-01 00:00:00+0100', tz='CET'), - Timestamp('2000-01-02 00:00:00+0100', tz='CET')], + array([Timestamp('2000-01-01 00:00:00+0100', tz='UTC+01:00'), + Timestamp('2000-01-02 00:00:00+0100', tz='UTC+01:00')], dtype=object) Or dtype='datetime64[ns]' to return an ndarray of native datetime64 values. The values are converted to UTC and the timezone info is dropped. diff --git a/src/snowflake/snowpark/modin/plugin/extensions/index.py b/src/snowflake/snowpark/modin/plugin/extensions/index.py index a0e86c2d82a..32316e5f08f 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/index.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/index.py @@ -2122,10 +2122,6 @@ def to_numpy( ----- The returned array will be the same up to equality (values equal in self will be equal in the returned array; likewise for values that are not equal). When self contains an ExtensionArray, the dtype may be different. For example, for a category-dtype Series, to_numpy() will return a NumPy array and the categorical dtype will be lost. 
- For NumPy dtypes, this will be a reference to the actual data stored in this Series or Index (assuming copy=False). Modifying the result in place will modify the data stored in the Series or Index (not that we recommend doing that). - - For extension types, to_numpy() may require copying data and coercing the result to a NumPy type (possibly object), which may be expensive. When you need a no-copy reference to the underlying data, Series.array should be used instead. - This table lays out the different dtypes and default return types of to_numpy() for various dtypes within pandas. --------------------------------------------------------- @@ -2154,8 +2150,8 @@ def to_numpy( >>> ser = pd.Series(pd.date_range('2000', periods=2, tz="CET")) >>> ser.to_numpy(dtype=object) - array([Timestamp('2000-01-01 00:00:00+0100', tz='CET'), - Timestamp('2000-01-02 00:00:00+0100', tz='CET')], + array([Timestamp('2000-01-01 00:00:00+0100', tz='UTC+01:00'), + Timestamp('2000-01-02 00:00:00+0100', tz='UTC+01:00')], dtype=object) Or dtype='datetime64[ns]' to return an ndarray of native datetime64 values. The values are converted to UTC and the timezone info is dropped. @@ -2170,7 +2166,6 @@ def to_numpy( argument="copy", message="copy is ignored in Snowflake backend", ) - # return self.to_pandas().array return ( self.to_pandas() .to_numpy( @@ -2703,6 +2698,10 @@ def __array__(self, dtype: Any = None) -> np.ndarray: """ The array interface, return the values. """ + # Ensure that the existing index dtype is preserved in the returned array + # if no other dtype is given. + if dtype is None: + dtype = self.dtype return self.to_pandas().__array__(dtype=dtype) def __repr__(self) -> str: diff --git a/tests/integ/modin/frame/test_getitem.py b/tests/integ/modin/frame/test_getitem.py index 03923701dd7..8abd281e541 100644 --- a/tests/integ/modin/frame/test_getitem.py +++ b/tests/integ/modin/frame/test_getitem.py @@ -36,31 +36,28 @@ np.array([], dtype=bool), ], ) +@sql_count_checker(query_count=1, join_count=1) def test_df_getitem_with_boolean_list_like( key, default_index_snowpark_pandas_df, default_index_native_df ): - # one added query to convert to native pandas and 1 added query for series initialization - with SqlCounter( - query_count=3 if isinstance(key, native_pd.Index) else 1, join_count=1 - ): - # df[boolean list-like key] is the same as df.loc[:, boolean list-like key] - if isinstance(key, native_pd.Index): - key = pd.Index(key) - - def get_helper(df): - if isinstance(df, pd.DataFrame): - return df[key] - # If pandas df, adjust the length of the df and key since boolean keys need to be the same length as the axis. - _key = try_convert_index_to_native(key) - _df = df.iloc[: len(key)] - _key = _key[: _df.shape[1]] - return _df[_key] + # df[boolean list-like key] is the same as df.loc[:, boolean list-like key] - eval_snowpark_pandas_result( - default_index_snowpark_pandas_df, - default_index_native_df, - get_helper, - ) + def get_helper(df, key): + if isinstance(df, pd.DataFrame): + if isinstance(key, native_pd.Index): + key = pd.Index(key) + return df[key] + # If pandas df, adjust the length of the df and key since boolean keys need to be the same length as the axis. 
+ _key = try_convert_index_to_native(key) + _df = df.iloc[: len(key)] + _key = _key[: _df.shape[1]] + return _df[_key] + + eval_snowpark_pandas_result( + default_index_snowpark_pandas_df, + default_index_native_df, + lambda df: get_helper(df, key), + ) @pytest.mark.parametrize( diff --git a/tests/integ/modin/index/test_astype.py b/tests/integ/modin/index/test_astype.py index b4578928825..76cebcec7d6 100644 --- a/tests/integ/modin/index/test_astype.py +++ b/tests/integ/modin/index/test_astype.py @@ -46,7 +46,7 @@ def test_index_astype(index, type): snow_index = pd.Index(index) with SqlCounter(query_count=1): - assert_index_equal(snow_index.astype(type), index.astype(type)) + assert_index_equal(snow_index.astype(type), index.astype(type), exact=False) @pytest.mark.parametrize( @@ -104,7 +104,9 @@ def test_index_astype_empty_index(from_type, to_type): native_index = native_pd.Index([], dtype=from_type) snow_index = pd.Index(native_index) with SqlCounter(query_count=1): - assert_index_equal(snow_index.astype(to_type), native_index.astype(to_type)) + assert_index_equal( + snow_index.astype(to_type), native_index.astype(to_type), exact=False + ) @pytest.mark.parametrize( @@ -166,8 +168,8 @@ def test_index_astype_bool_nan_none(): snow_index = pd.Index(native_index) with pytest.raises(AssertionError): assert_index_equal(snow_index.astype(bool), native_index.astype(bool)) - expected_result = native_pd.Index([True, True, True, False, False], dtype=bool) - assert_index_equal(snow_index.astype(bool), expected_result) + expected_result = native_pd.Index([True, True, True, None, None], dtype=bool) + assert_index_equal(snow_index.astype(bool), expected_result, exact=False) # Another case where this arises is when a float Index with "None" in it is used. pandas # converts None to NaN during Index creation and thus leads to this difference. @@ -180,7 +182,7 @@ def test_index_astype_bool_nan_none(): expected_result = native_pd.Index( [True, True, True, True, False, False], dtype=bool ) - assert_index_equal(snow_index.astype(bool), expected_result) + assert_index_equal(snow_index.astype(bool), expected_result, exact=False) @sql_count_checker(query_count=2) From f7eccfe8d300d35c8e4a8e3d0e48d7ef6b8ce6ba Mon Sep 17 00:00:00 2001 From: Hazem Elmeleegy Date: Fri, 25 Oct 2024 11:44:07 -0700 Subject: [PATCH 3/7] fix errors --- .../snowpark/modin/plugin/docstrings/series.py | 16 ++++++++-------- .../snowpark/modin/plugin/extensions/index.py | 16 ++++++++-------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/series.py b/src/snowflake/snowpark/modin/plugin/docstrings/series.py index 976d2b01c24..37169ede461 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/series.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/series.py @@ -3399,21 +3399,21 @@ def to_numpy(): This table lays out the different dtypes and default return types of to_numpy() for various dtypes within pandas. 
- --------------------------------------------------------- + +--------------------+----------------------------------+ | dtype | array type | - --------------------------------------------------------- + +--------------------+----------------------------------+ | category[T] | ndarray[T] (same dtype as input) | - --------------------------------------------------------- + +--------------------+----------------------------------+ | period | ndarray[object] (Periods) | - --------------------------------------------------------- + +--------------------+----------------------------------+ | interval | ndarray[object] (Intervals) | - --------------------------------------------------------- + +--------------------+----------------------------------+ | IntegerNA | ndarray[object] | - --------------------------------------------------------- + +--------------------+----------------------------------+ | datetime64[ns] | datetime64[ns] | - --------------------------------------------------------- + +--------------------+----------------------------------+ | datetime64[ns, tz] | ndarray[object] (Timestamps) | - --------------------------------------------------------- + +--------------------+----------------------------------+ Examples -------- diff --git a/src/snowflake/snowpark/modin/plugin/extensions/index.py b/src/snowflake/snowpark/modin/plugin/extensions/index.py index 32316e5f08f..b7512b34e61 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/index.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/index.py @@ -2124,21 +2124,21 @@ def to_numpy( This table lays out the different dtypes and default return types of to_numpy() for various dtypes within pandas. - --------------------------------------------------------- + +--------------------+----------------------------------+ | dtype | array type | - --------------------------------------------------------- + +--------------------+----------------------------------+ | category[T] | ndarray[T] (same dtype as input) | - --------------------------------------------------------- + +--------------------+----------------------------------+ | period | ndarray[object] (Periods) | - --------------------------------------------------------- + +--------------------+----------------------------------+ | interval | ndarray[object] (Intervals) | - --------------------------------------------------------- + +--------------------+----------------------------------+ | IntegerNA | ndarray[object] | - --------------------------------------------------------- + +--------------------+----------------------------------+ | datetime64[ns] | datetime64[ns] | - --------------------------------------------------------- + +--------------------+----------------------------------+ | datetime64[ns, tz] | ndarray[object] (Timestamps) | - --------------------------------------------------------- + +--------------------+----------------------------------+ Examples -------- From e60801afbbd8d7ba153051504d70c580d6985ed3 Mon Sep 17 00:00:00 2001 From: Hazem Elmeleegy Date: Fri, 25 Oct 2024 12:14:27 -0700 Subject: [PATCH 4/7] fix errors --- tests/integ/modin/index/test_astype.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/integ/modin/index/test_astype.py b/tests/integ/modin/index/test_astype.py index 76cebcec7d6..2b7030375ae 100644 --- a/tests/integ/modin/index/test_astype.py +++ b/tests/integ/modin/index/test_astype.py @@ -46,7 +46,7 @@ def test_index_astype(index, type): snow_index = pd.Index(index) with 
SqlCounter(query_count=1):
-        assert_index_equal(snow_index.astype(type), index.astype(type), exact=False)
+        assert_index_equal(snow_index.astype(type), index.astype(type))
 
 
 @pytest.mark.parametrize(
@@ -104,6 +104,8 @@ def test_index_astype_empty_index(from_type, to_type):
     native_index = native_pd.Index([], dtype=from_type)
     snow_index = pd.Index(native_index)
     with SqlCounter(query_count=1):
+        # exact=False is used because of a discrepancy in the "inferred_type" attribute
+        # when to_type is bool between Snowpark pandas (empty) and native pandas (bool).
         assert_index_equal(
             snow_index.astype(to_type), native_index.astype(to_type), exact=False
         )
@@ -168,8 +170,8 @@ def test_index_astype_bool_nan_none():
     snow_index = pd.Index(native_index)
     with pytest.raises(AssertionError):
         assert_index_equal(snow_index.astype(bool), native_index.astype(bool))
-    expected_result = native_pd.Index([True, True, True, None, None], dtype=bool)
-    assert_index_equal(snow_index.astype(bool), expected_result, exact=False)
+    expected_result = native_pd.Index([True, True, True, False, False], dtype=bool)
+    assert_index_equal(snow_index.astype(bool), expected_result)
 
     # Another case where this arises is when a float Index with "None" in it is used. pandas
     # converts None to NaN during Index creation and thus leads to this difference.
@@ -182,7 +184,7 @@ def test_index_astype_bool_nan_none():
     expected_result = native_pd.Index(
         [True, True, True, True, False, False], dtype=bool
     )
-    assert_index_equal(snow_index.astype(bool), expected_result, exact=False)
+    assert_index_equal(snow_index.astype(bool), expected_result)
 
 
 @sql_count_checker(query_count=2)

From 4f2784fa9e0f7be710988a035997c5feb502f543 Mon Sep 17 00:00:00 2001
From: Hazem Elmeleegy
Date: Fri, 25 Oct 2024 13:56:46 -0700
Subject: [PATCH 5/7] fix errors

---
 src/snowflake/snowpark/modin/plugin/_internal/type_utils.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/snowflake/snowpark/modin/plugin/_internal/type_utils.py b/src/snowflake/snowpark/modin/plugin/_internal/type_utils.py
index 404e5ffd76a..90f470d44f4 100644
--- a/src/snowflake/snowpark/modin/plugin/_internal/type_utils.py
+++ b/src/snowflake/snowpark/modin/plugin/_internal/type_utils.py
@@ -306,6 +306,9 @@ def column_astype(
     if to_dtype == np.object_:
         return to_variant(curr_col)
     if from_sf_type == to_sf_type:
+        if isinstance(to_sf_type, BooleanType):
+            # treat NULL values in boolean columns as False to match pandas behavior
+            return iff(curr_col.is_null(), False, curr_col)
         return curr_col
 
     if isinstance(to_sf_type, _IntegralType) and "int64" not in str(to_dtype).lower():
@@ -387,7 +390,7 @@ def column_astype(
     # astype should not have any effect on NULL values except when casting to boolean
     if isinstance(to_sf_type, BooleanType):
         # treat NULL values in boolean columns as False to match pandas behavior
-        return iff(curr_col.is_null(), pandas_lit(False), new_col)
+        return iff(curr_col.is_null(), False, new_col)
     else:
         return iff(curr_col.is_null(), None, new_col)

From 2207d65fa30b112de90199d9ca04dd4869d301df Mon Sep 17 00:00:00 2001
From: Hazem Elmeleegy
Date: Fri, 25 Oct 2024 14:48:29 -0700
Subject: [PATCH 6/7] fix errors

---
 src/snowflake/snowpark/modin/plugin/docstrings/series.py | 6 +++---
 src/snowflake/snowpark/modin/plugin/extensions/index.py | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/series.py b/src/snowflake/snowpark/modin/plugin/docstrings/series.py
index 
37169ede461..db96b18d4de 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/series.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/series.py @@ -3426,14 +3426,14 @@ def to_numpy(): >>> ser = pd.Series(pd.date_range('2000', periods=2, tz="CET")) >>> ser.to_numpy(dtype=object) array([Timestamp('2000-01-01 00:00:00+0100', tz='UTC+01:00'), - Timestamp('2000-01-02 00:00:00+0100', tz='UTC+01:00')], - dtype=object) + Timestamp('2000-01-02 00:00:00+0100', tz='UTC+01:00')], + dtype=object) Or dtype='datetime64[ns]' to return an ndarray of native datetime64 values. The values are converted to UTC and the timezone info is dropped. >>> ser.to_numpy(dtype="datetime64[ns]") array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'], - dtype='datetime64[ns]') + dtype='datetime64[ns]') """ tolist = to_list diff --git a/src/snowflake/snowpark/modin/plugin/extensions/index.py b/src/snowflake/snowpark/modin/plugin/extensions/index.py index b7512b34e61..e2e5d0c726c 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/index.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/index.py @@ -2151,14 +2151,14 @@ def to_numpy( >>> ser = pd.Series(pd.date_range('2000', periods=2, tz="CET")) >>> ser.to_numpy(dtype=object) array([Timestamp('2000-01-01 00:00:00+0100', tz='UTC+01:00'), - Timestamp('2000-01-02 00:00:00+0100', tz='UTC+01:00')], - dtype=object) + Timestamp('2000-01-02 00:00:00+0100', tz='UTC+01:00')], + dtype=object) Or dtype='datetime64[ns]' to return an ndarray of native datetime64 values. The values are converted to UTC and the timezone info is dropped. >>> ser.to_numpy(dtype="datetime64[ns]") array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'], - dtype='datetime64[ns]') + dtype='datetime64[ns]') """ if copy: WarningMessage.ignored_argument( From 547e42371e9e431c64d20562ade647360c9d2fbc Mon Sep 17 00:00:00 2001 From: Hazem Elmeleegy Date: Fri, 25 Oct 2024 19:15:07 -0700 Subject: [PATCH 7/7] fix errors --- tests/integ/modin/test_to_numpy.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tests/integ/modin/test_to_numpy.py b/tests/integ/modin/test_to_numpy.py index 0e61f595e4e..c6745b8b530 100644 --- a/tests/integ/modin/test_to_numpy.py +++ b/tests/integ/modin/test_to_numpy.py @@ -146,7 +146,7 @@ def test_variant_data_to_numpy(pandas_obj): @sql_count_checker(query_count=1) -def test_to_numpy_copy_true(caplog): +def test_to_numpy_copy_true_series(caplog): series = pd.Series([1]) caplog.clear() @@ -156,6 +156,17 @@ def test_to_numpy_copy_true(caplog): assert "has been ignored by Snowpark pandas" in caplog.text +@sql_count_checker(query_count=1) +def test_to_numpy_copy_true_index(caplog): + idx = pd.Index([1]) + + caplog.clear() + WarningMessage.printed_warnings.clear() + with caplog.at_level(logging.WARNING): + assert_array_equal(idx.to_numpy(copy=True), native_pd.Index([1]).to_numpy()) + assert "has been ignored by Snowpark pandas" in caplog.text + + @sql_count_checker(query_count=1) def test_to_numpy_warning(caplog): series = pd.Series([1])