Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SNOW-1707707: Add support for Index.to_numpy #2504

Merged
merged 9 commits into from
Oct 28, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
- Added support for applying Snowpark Python function `snowflake_cortex_summarize`.
- Added support for `DataFrame.attrs` and `Series.attrs`.
- Added support for `DataFrame.style`.
- Added support for `Index.to_numpy`.

#### Improvements

Expand Down
2 changes: 2 additions & 0 deletions docs/source/modin/supported/index_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,8 @@ Methods
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``to_frame`` | Y | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``to_numpy`` | Y | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``view`` | N | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``argsort`` | N | | |
Expand Down
8 changes: 6 additions & 2 deletions src/snowflake/snowpark/modin/plugin/_internal/type_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,8 +384,12 @@ def column_astype(
new_col = cast(curr_col, LongType())
else:
new_col = cast(curr_col, to_sf_type)
# astype should not have any effect on NULL values
return iff(curr_col.is_null(), None, new_col)
# astype should not have any effect on NULL values except when casting to boolean
if isinstance(to_sf_type, BooleanType):
# treat NULL values in boolean columns as False to match pandas behavior
return iff(curr_col.is_null(), pandas_lit(False), new_col)
else:
return iff(curr_col.is_null(), None, new_col)


def is_astype_type_error(
Expand Down
16 changes: 14 additions & 2 deletions src/snowflake/snowpark/modin/plugin/_internal/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,12 @@
import pandas as native_pd
from pandas._typing import AnyArrayLike, Scalar
from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.common import is_integer_dtype, is_object_dtype, is_scalar
from pandas.core.dtypes.common import (
is_bool_dtype,
is_integer_dtype,
is_object_dtype,
is_scalar,
)
from pandas.core.dtypes.inference import is_list_like

import snowflake.snowpark.modin.plugin._internal.statement_params_constants as STATEMENT_PARAMS
Expand Down Expand Up @@ -1557,7 +1562,14 @@ def convert_str_to_timedelta(x: str) -> pd.Timedelta:
# example, an empty dataframe will be object dtype by default, or a variant, or a timestamp column with
# multiple timezones. So here we cast the index to the index_type when ret = pd.Index(...) above cannot
# figure out a non-object dtype. Note that the index_type is a logical type may not be 100% accurate.
if is_object_dtype(ret.dtype) and not is_object_dtype(index_type):
# We exclude the case where ret.dtype is object dtype while index_dtype is bool dtype. This is because
# casting None values to bool converts them to False, which results in a discrepancy with the pandas
# behavior.
if (
is_object_dtype(ret.dtype)
and not is_object_dtype(index_type)
and not is_bool_dtype(index_type)
):
# TODO: SNOW-1657460 fix index_type for timestamp_tz
try:
ret = ret.astype(index_type)
Expand Down
51 changes: 51 additions & 0 deletions src/snowflake/snowpark/modin/plugin/docstrings/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3383,6 +3383,57 @@ def to_numpy():
Returns
-------
numpy.ndarray

See Also
--------
Series.array
Get the actual data stored within.
Index.array
Get the actual data stored within.
DataFrame.to_numpy
Similar method for DataFrame.

Notes
-----
The returned array will be the same up to equality (values equal in self will be equal in the returned array; likewise for values that are not equal). When self contains an ExtensionArray, the dtype may be different. For example, for a category-dtype Series, to_numpy() will return a NumPy array and the categorical dtype will be lost.

This table lays out the different dtypes and default return types of to_numpy() for various dtypes within pandas.

+--------------------+----------------------------------+
| dtype | array type |
+--------------------+----------------------------------+
| category[T] | ndarray[T] (same dtype as input) |
+--------------------+----------------------------------+
| period | ndarray[object] (Periods) |
+--------------------+----------------------------------+
| interval | ndarray[object] (Intervals) |
+--------------------+----------------------------------+
| IntegerNA | ndarray[object] |
+--------------------+----------------------------------+
| datetime64[ns] | datetime64[ns] |
+--------------------+----------------------------------+
| datetime64[ns, tz] | ndarray[object] (Timestamps) |
+--------------------+----------------------------------+

Examples
--------
>>> ser = pd.Series(pd.Categorical(['a', 'b', 'a'])) # doctest: +SKIP
>>> ser.to_numpy() # doctest: +SKIP
array(['a', 'b', 'a'], dtype=object)

Specify the dtype to control how datetime-aware data is represented. Use dtype=object to return an ndarray of pandas Timestamp objects, each with the correct tz.

>>> ser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))
>>> ser.to_numpy(dtype=object)
array([Timestamp('2000-01-01 00:00:00+0100', tz='UTC+01:00'),
Timestamp('2000-01-02 00:00:00+0100', tz='UTC+01:00')],
dtype=object)

Or dtype='datetime64[ns]' to return an ndarray of native datetime64 values. The values are converted to UTC and the timezone info is dropped.

>>> ser.to_numpy(dtype="datetime64[ns]")
array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'],
dtype='datetime64[ns]')
"""

tolist = to_list
Expand Down
103 changes: 102 additions & 1 deletion src/snowflake/snowpark/modin/plugin/extensions/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,13 @@

import modin
import numpy as np
import numpy.typing as npt
import pandas as native_pd
from modin.pandas import DataFrame, Series
from modin.pandas.base import BasePandasDataset
from pandas import get_option
from pandas._libs import lib
from pandas._libs.lib import is_list_like, is_scalar
from pandas._libs.lib import is_list_like, is_scalar, no_default
from pandas._typing import ArrayLike, DateTimeErrorChoices, DtypeObj, NaPosition, Scalar
from pandas.core.arrays import ExtensionArray
from pandas.core.dtypes.base import ExtensionDtype
Expand Down Expand Up @@ -2079,6 +2080,102 @@ def to_frame(

return DataFrame(query_compiler=new_qc)

def to_numpy(
    self,
    dtype: npt.DTypeLike | None = None,
    copy: bool = False,
    na_value: object = no_default,
    **kwargs: Any,
) -> np.ndarray:
    """
    Return a NumPy ndarray representing the values in this Series or Index.

    Parameters
    ----------
    dtype : str or numpy.dtype, optional
        The dtype to pass to :meth:`numpy.asarray`.
    copy : bool, default False
        This argument is ignored in Snowflake backend. The data from Snowflake
        will be retrieved into the client, and a numpy array containing this
        data will be returned.
    na_value : Any, optional
        The value to use for missing values. The default value depends
        on `dtype` and the type of the array.
    **kwargs
        Additional keywords passed through to the ``to_numpy`` method
        of the underlying array (for extension arrays).

    Returns
    -------
    numpy.ndarray

    See Also
    --------
    Series.array
        Get the actual data stored within.
    Index.array
        Get the actual data stored within.
    DataFrame.to_numpy
        Similar method for DataFrame.

    Notes
    -----
    Values that compare equal in self compare equal in the returned array, and
    values that are unequal stay unequal. When self is backed by an
    ExtensionArray, the resulting dtype may differ: for example, a
    category-dtype Series yields a plain NumPy array from ``to_numpy()`` and
    the categorical dtype is lost.

    The table below lays out the different dtypes and default return types of
    ``to_numpy()`` for various dtypes within pandas.

    +--------------------+----------------------------------+
    | dtype              | array type                       |
    +--------------------+----------------------------------+
    | category[T]        | ndarray[T] (same dtype as input) |
    +--------------------+----------------------------------+
    | period             | ndarray[object] (Periods)        |
    +--------------------+----------------------------------+
    | interval           | ndarray[object] (Intervals)      |
    +--------------------+----------------------------------+
    | IntegerNA          | ndarray[object]                  |
    +--------------------+----------------------------------+
    | datetime64[ns]     | datetime64[ns]                   |
    +--------------------+----------------------------------+
    | datetime64[ns, tz] | ndarray[object] (Timestamps)     |
    +--------------------+----------------------------------+

    Examples
    --------
    >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))  # doctest: +SKIP
    >>> ser.to_numpy()  # doctest: +SKIP
    array(['a', 'b', 'a'], dtype=object)

    Specify the dtype to control how datetime-aware data is represented. Use dtype=object to return an ndarray of pandas Timestamp objects, each with the correct tz.

    >>> ser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))
    >>> ser.to_numpy(dtype=object)
    array([Timestamp('2000-01-01 00:00:00+0100', tz='UTC+01:00'),
           Timestamp('2000-01-02 00:00:00+0100', tz='UTC+01:00')],
          dtype=object)

    Or dtype='datetime64[ns]' to return an ndarray of native datetime64 values. The values are converted to UTC and the timezone info is dropped.

    >>> ser.to_numpy(dtype="datetime64[ns]")
    array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'],
          dtype='datetime64[ns]')
    """
    # ``copy`` has no effect here: the data is always materialized on the
    # client from Snowflake, so warn the caller that the flag is ignored.
    if copy:
        WarningMessage.ignored_argument(
            operation="to_numpy",
            argument="copy",
            message="copy is ignored in Snowflake backend",
        )
    # Pull the index down to a native pandas object, convert it with the
    # requested dtype/na_value, and flatten to guarantee a 1-D result.
    native_index = self.to_pandas()
    values = native_index.to_numpy(
        dtype=dtype,
        na_value=na_value,
        **kwargs,
    )
    return values.flatten()

@index_not_implemented()
def fillna(self) -> None:
"""
Expand Down Expand Up @@ -2601,6 +2698,10 @@ def __array__(self, dtype: Any = None) -> np.ndarray:
"""
The array interface, return the values.
"""
# Ensure that the existing index dtype is preserved in the returned array
# if no other dtype is given.
if dtype is None:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This fixes a pre-existing issue that was causing some test failures with the introduction of Index.to_numpy.

dtype = self.dtype
return self.to_pandas().__array__(dtype=dtype)

def __repr__(self) -> str:
Expand Down
39 changes: 18 additions & 21 deletions tests/integ/modin/frame/test_getitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,31 +36,28 @@
np.array([], dtype=bool),
],
)
@sql_count_checker(query_count=1, join_count=1)
def test_df_getitem_with_boolean_list_like(
key, default_index_snowpark_pandas_df, default_index_native_df
):
# one added query to convert to native pandas and one added query for series initialization
with SqlCounter(
query_count=3 if isinstance(key, native_pd.Index) else 1, join_count=1
):
# df[boolean list-like key] is the same as df.loc[:, boolean list-like key]
if isinstance(key, native_pd.Index):
Copy link
Contributor Author

@sfc-gh-helmeleegy sfc-gh-helmeleegy Oct 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There was no need to convert the native pandas index into a Snowpark pandas index when the df it's being passed to is actually a native pandas DataFrame. This was also causing test failures on the introduction of Index.to_numpy.

key = pd.Index(key)

def get_helper(df):
if isinstance(df, pd.DataFrame):
return df[key]
# If pandas df, adjust the length of the df and key since boolean keys need to be the same length as the axis.
_key = try_convert_index_to_native(key)
_df = df.iloc[: len(key)]
_key = _key[: _df.shape[1]]
return _df[_key]
# df[boolean list-like key] is the same as df.loc[:, boolean list-like key]

eval_snowpark_pandas_result(
default_index_snowpark_pandas_df,
default_index_native_df,
get_helper,
)
def get_helper(df, key):
if isinstance(df, pd.DataFrame):
if isinstance(key, native_pd.Index):
key = pd.Index(key)
return df[key]
# If pandas df, adjust the length of the df and key since boolean keys need to be the same length as the axis.
_key = try_convert_index_to_native(key)
_df = df.iloc[: len(key)]
_key = _key[: _df.shape[1]]
return _df[_key]

eval_snowpark_pandas_result(
default_index_snowpark_pandas_df,
default_index_native_df,
lambda df: get_helper(df, key),
)


@pytest.mark.parametrize(
Expand Down
6 changes: 5 additions & 1 deletion tests/integ/modin/index/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,11 @@ def test_index_astype_empty_index(from_type, to_type):
native_index = native_pd.Index([], dtype=from_type)
snow_index = pd.Index(native_index)
with SqlCounter(query_count=1):
assert_index_equal(snow_index.astype(to_type), native_index.astype(to_type))
# exact=False is used because of a discrepancy in the "inferred_type" attribute
# when to_type is bool between Snowpark pandas (empty) and native pandas (bool).
assert_index_equal(
snow_index.astype(to_type), native_index.astype(to_type), exact=False
)


@pytest.mark.parametrize(
Expand Down
7 changes: 5 additions & 2 deletions tests/integ/modin/test_to_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,15 @@
[datetime.datetime(2023, 1, 1), datetime.datetime(2023, 1, 1, 1, 2, 3), None],
],
)
@pytest.mark.parametrize("pandas_obj", ["DataFrame", "Series"])
@pytest.mark.parametrize("pandas_obj", ["DataFrame", "Series", "Index"])
@pytest.mark.parametrize("func", ["to_numpy", "values"])
def test_to_numpy_basic(data, pandas_obj, func):
if pandas_obj == "Series":
df = pd.Series(data)
native_df = native_pd.Series(data)
elif pandas_obj == "Index":
df = pd.Index(data)
native_df = native_pd.Index(data)
else:
df = pd.DataFrame([data, data])
native_df = native_pd.DataFrame([data, data])
Expand Down Expand Up @@ -109,7 +112,7 @@ def test_tz_aware_data_to_numpy(session):
assert_array_equal(df.to_numpy(), expected_result)


@pytest.mark.parametrize("pandas_obj", ["DataFrame", "Series"])
@pytest.mark.parametrize("pandas_obj", ["DataFrame", "Series", "Index"])
@sql_count_checker(query_count=1)
def test_variant_data_to_numpy(pandas_obj):
data = [
Expand Down
Loading