Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SNOW-1518378 - Provide a numpy compatibility mapping to np.full_like #2499

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,9 @@
- Added support for applying Snowpark Python function `snowflake_cortex_summarize`.
- Added support for `DataFrame.attrs` and `Series.attrs`.
- Added support for `DataFrame.style`.
- Added support for `Index.to_numpy`.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are probably merge artifacts?

- Added support for `DataFrame.align` and `Series.align` for `axis=0`.
- Added numpy compatibility support for `np.full_like`.

#### Improvements

Expand Down
3 changes: 3 additions & 0 deletions docs/source/modin/numpy.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ NumPy ufuncs called with Snowpark pandas arguments will ignore kwargs.
| | dispatcher at all, and the normal NumPy behavior |
| | will occur.) |
+-----------------------------+----------------------------------------------------+
| ``np.full_like`` | Mapped to pd.DataFrame(value, index=range(height), |
| | columns=range(width)) |
+-----------------------------+----------------------------------------------------+
| ``np.may_share_memory`` | Returns False |
+-----------------------------+----------------------------------------------------+
| ``np.add`` | Mapped to df.__add__(df2) |
Expand Down
33 changes: 32 additions & 1 deletion src/snowflake/snowpark/modin/plugin/utils/numpy_to_pandas.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#
# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
#
from typing import Any, Optional, Union
from typing import Any, Hashable, Optional, Union

import modin.pandas as pd
from modin.pandas.base import BasePandasDataset
Expand Down Expand Up @@ -112,6 +112,36 @@ def may_share_memory_mapper(a: Any, b: Any, max_work: Optional[int] = None) -> b
return False


def full_like_mapper(
    a: Union[pd.DataFrame, pd.Series],
    fill_value: Hashable,
    dtype: Optional[Any] = None,
    order: Optional[str] = "K",
    subok: Optional[bool] = True,
    shape: Optional[Union[int, tuple[int, ...]]] = None,
) -> Union[pd.DataFrame, pd.Series]:
    """
    Map ``np.full_like`` onto a pandas object filled with ``fill_value``.

    Args:
        a: The object whose shape is used when ``shape`` is not given.
        fill_value: The value placed in every cell of the result.
        dtype: Only ``None`` is supported.
        order: Only ``"K"`` (the default) is supported.
        subok: Only ``True`` (the default) is supported.
        shape: Optional override of the result shape. May be an integer, or a
            sequence of one or two integers. The empty tuple ``()`` yields a
            Series of length 1.

    Returns:
        A DataFrame with ``range`` index and columns for a two-dimensional
        shape, a Series with a ``range`` index for a one-dimensional shape, or
        ``NotImplemented`` for any unsupported argument combination (the
        accompanying tests expect numpy to surface that as a ``TypeError``).
    """
    import operator

    # NOTE(review): dtype is deliberately still rejected. Reviewers asked
    # whether it could simply override the dtype taken from `a`, and whether
    # the result should mirror per-column dtypes of `a`; neither is done here.
    if not subok or order != "K" or dtype is not None:
        return NotImplemented

    if shape is None:
        # Only touch a.shape when no explicit shape was supplied.
        dims = tuple(a.shape)
    elif isinstance(shape, tuple) and not shape:
        dims = (1,)
    else:
        try:
            # Scalar shape: any integer-like object.
            dims = (operator.index(shape),)
        except TypeError:
            try:
                # Sequence shape: every entry must itself be integer-like, so
                # strings such as "ab" and float dimensions are rejected here
                # rather than failing incidentally inside range().
                dims = tuple(operator.index(dim) for dim in shape)
            except TypeError:
                return NotImplemented

    if len(dims) == 2:
        height, width = dims
        return pd.DataFrame(fill_value, index=range(height), columns=range(width))
    if len(dims) == 1:
        return pd.Series(fill_value, index=range(dims[0]))
    return NotImplemented


# We also need to convert everything to booleans, since numpy will
# do this implicitly on logical operators and pandas does not.
def map_to_bools(inputs: Any) -> Any:
Expand All @@ -125,6 +155,7 @@ def map_to_bools(inputs: Any) -> Any:
# Associates a numpy function name (as a string) with the mapper function in
# this module that implements it for pandas-style arguments.
# NOTE(review): presumably consulted by the numpy dispatch hook on the
# DataFrame/Series classes; confirm against the caller.
numpy_to_pandas_func_map = {
"where": where_mapper,
"may_share_memory": may_share_memory_mapper,
"full_like": full_like_mapper,
}

# Map that associates a numpy universal function name that operates on
Expand Down
57 changes: 57 additions & 0 deletions tests/integ/modin/test_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,63 @@ def test_np_may_share_memory():
assert not np.may_share_memory(snow_df_A, native_df_A)


def test_full_like():
    """np.full_like on Snowpark pandas inputs matches native pandas inputs."""
    data = {
        "A": [0, 1, 2, 0, 1, 2, 0, 1, 2],
        "B": [True, False, True, True, False, True, False, False, False],
        "C": ["a", "b", "c", "d", "a", "b", "c", "d", "e"],
    }
    snow_df = pd.DataFrame(data)
    pandas_df = native_pd.DataFrame(data)

    def whole_frame(frame):
        return frame

    def column_a(frame):
        return frame["A"]

    # (expected query count, input selector, positional args, keyword args)
    # NOTE(review): the cases without an explicit shape expect one more query,
    # presumably for reading the shape of the Snowpark object; confirm.
    supported_cases = [
        (2, whole_frame, (1234,), {}),
        (1, whole_frame, (1234,), {"shape": (5, 3)}),
        (2, column_a, (1234,), {}),
        (1, whole_frame, ("numpy is the best",), {}),
        (1, whole_frame, (), {"fill_value": 4, "shape": ()}),
        (1, whole_frame, (), {"fill_value": 4, "shape": 4}),
        (1, whole_frame, (), {"fill_value": 4, "shape": (4,)}),
    ]
    for expected_queries, select, args, kwargs in supported_cases:
        # The selector runs inside the counter so that any query it issues is
        # attributed to the same case as in the hand-written version.
        with SqlCounter(query_count=expected_queries):
            snow_result = np.full_like(select(snow_df), *args, **kwargs)
            pandas_result = np.full_like(select(pandas_df), *args, **kwargs)
            # NOTE(review): results are compared as arrays only; a reviewer
            # asked whether the dataframes themselves should be compared.
            assert_array_equal(np.array(snow_result), np.array(pandas_result))

    # Argument combinations the mapper does not support must raise TypeError.
    unsupported_kwargs = (
        {"shape": []},
        {"subok": False},
        {"order": "D"},
        {"dtype": int},
    )
    for bad_kwargs in unsupported_kwargs:
        with pytest.raises(TypeError):
            np.full_like(snow_df, 1234, **bad_kwargs)
sfc-gh-jkew marked this conversation as resolved.
Show resolved Hide resolved


def test_logical_operators():
data = {
"A": [0, 1, 2, 0, 1, 2, 0, 1, 2],
Expand Down
Loading