From c3cf7c6079a7c44d98bb8a2d0afd16785a37fd40 Mon Sep 17 00:00:00 2001 From: John Kew Date: Wed, 23 Oct 2024 16:55:42 -0700 Subject: [PATCH] Provide a numpy compatibility mapping to np.full_like --- CHANGELOG.md | 1 + docs/source/modin/numpy.rst | 3 ++ .../modin/plugin/utils/numpy_to_pandas.py | 27 ++++++++++++- tests/integ/modin/test_numpy.py | 39 +++++++++++++++++++ 4 files changed, 69 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7ce7fee72f..c3b3db51fe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,6 +38,7 @@ - Added support for timedelta inputs in `value_counts()`. - Added support for applying Snowpark Python function `snowflake_cortex_summarize`. - Added support for `DataFrame`/`Series.attrs` +- Added numpy compatibility support for `np.full_like` #### Improvements diff --git a/docs/source/modin/numpy.rst b/docs/source/modin/numpy.rst index fa9c650aba..83d24bdd63 100644 --- a/docs/source/modin/numpy.rst +++ b/docs/source/modin/numpy.rst @@ -25,6 +25,9 @@ NumPy ufuncs called with Snowpark pandas arguments will ignore kwargs. | | dispatcher at all, and the normal NumPy behavior | | | will occur.) | +-----------------------------+----------------------------------------------------+ +| ``np.full_like`` | Mapped to pd.DataFrame(value, index=range(height), | +| | columns=range(width)) | ++-----------------------------+----------------------------------------------------+ | ``np.may_share_memory`` | Returns False | +-----------------------------+----------------------------------------------------+ | ``np.add`` | Mapped to df.__add__(df2) | diff --git a/src/snowflake/snowpark/modin/plugin/utils/numpy_to_pandas.py b/src/snowflake/snowpark/modin/plugin/utils/numpy_to_pandas.py index c751b4fe55..4fa1b58bab 100644 --- a/src/snowflake/snowpark/modin/plugin/utils/numpy_to_pandas.py +++ b/src/snowflake/snowpark/modin/plugin/utils/numpy_to_pandas.py @@ -1,7 +1,7 @@ # # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. 
#
-from typing import Any, Optional, Union
+from typing import Any, Hashable, Optional, Union
 
 import modin.pandas as pd
 from modin.pandas.base import BasePandasDataset
@@ -112,6 +112,52 @@ def may_share_memory_mapper(a: Any, b: Any, max_work: Optional[int] = None) -> b
     return False
 
 
+def full_like_mapper(
+    a: Union[pd.DataFrame, pd.Series],
+    fill_value: Hashable,
+    dtype: Optional[Any] = None,
+    order: Optional[str] = "K",
+    subok: Optional[bool] = True,
+    shape: Optional[Union[int, tuple[int, ...]]] = None,
+) -> Union[pd.DataFrame, pd.Series]:
+    """
+    Maps np.full_like to a Snowpark pandas DataFrame or Series constructor.
+
+    Only the numpy defaults for dtype, order and subok are handled. Any other
+    value returns NotImplemented; test_full_like expects a TypeError from
+    np.full_like in those cases.
+
+    Args:
+        a: Object whose shape is used when shape is not given.
+        fill_value: Value placed in every cell of the result.
+        dtype: Must be None.
+        order: Must be "K".
+        subok: Must be truthy.
+        shape: Optional result shape overriding a.shape: an int or 1-tuple
+            for a Series, a 2-tuple for a DataFrame.
+
+    Returns:
+        A DataFrame for 2-D shapes, a Series for 1-D shapes, and
+        NotImplemented for unsupported arguments or dimensionality.
+    """
+    if not subok or order != "K" or dtype is not None:
+        return NotImplemented
+
+    # Test against None instead of truthiness so an explicit falsy shape such
+    # as () or 0 is honored rather than silently replaced by a.shape.
+    result_shape = a.shape if shape is None else shape
+    if not hasattr(result_shape, "__len__"):
+        # numpy accepts a bare integer as shorthand for a 1-D shape.
+        result_shape = (result_shape,)
+
+    if len(result_shape) == 2:
+        height, width = result_shape  # type: ignore
+        return pd.DataFrame(fill_value, index=range(height), columns=range(width))
+    if len(result_shape) == 1:
+        return pd.Series(fill_value, index=range(result_shape[0]))
+    return NotImplemented
+
+
 # We also need to convert everything to booleans, since numpy will
 # do this implicitly on logical operators and pandas does not.
def map_to_bools(inputs: Any) -> Any:
@@ -125,6 +149,7 @@ def map_to_bools(inputs: Any) -> Any:
 numpy_to_pandas_func_map = {
     "where": where_mapper,
     "may_share_memory": may_share_memory_mapper,
+    "full_like": full_like_mapper,
 }
 
 # Map that associates a numpy universal function name that operates on
diff --git a/tests/integ/modin/test_numpy.py b/tests/integ/modin/test_numpy.py
index 6719055ab9..b437266561 100644
--- a/tests/integ/modin/test_numpy.py
+++ b/tests/integ/modin/test_numpy.py
@@ -57,6 +57,51 @@ def test_np_may_share_memory():
     assert not np.may_share_memory(snow_df_A, native_df_A)
 
 
+def test_full_like():
+    """np.full_like on Snowpark pandas inputs should match native pandas."""
+    data = {
+        "A": [0, 1, 2, 0, 1, 2, 0, 1, 2],
+        "B": [True, False, True, True, False, True, False, False, False],
+        "C": ["a", "b", "c", "d", "a", "b", "c", "d", "e"],
+    }
+    snow_df = pd.DataFrame(data)
+    pandas_df = native_pd.DataFrame(data)
+
+    # DataFrame input, no explicit shape
+    with SqlCounter(query_count=2):
+        snow_result = np.full_like(snow_df, 1234)
+        pandas_result = np.full_like(pandas_df, 1234)
+        assert_array_equal(np.array(snow_result), np.array(pandas_result))
+
+    # DataFrame input with an explicit shape override
+    with SqlCounter(query_count=1):
+        snow_result = np.full_like(snow_df, 1234, shape=(5, 3))
+        pandas_result = np.full_like(pandas_df, 1234, shape=(5, 3))
+        assert_array_equal(np.array(snow_result), np.array(pandas_result))
+
+    # Series input
+    with SqlCounter(query_count=2):
+        snow_result = np.full_like(snow_df["A"], 1234)
+        pandas_result = np.full_like(pandas_df["A"], 1234)
+        assert_array_equal(np.array(snow_result), np.array(pandas_result))
+
+    # String fill value
+    with SqlCounter(query_count=1):
+        snow_result = np.full_like(snow_df, "numpy is the best")
+        pandas_result = np.full_like(pandas_df, "numpy is the best")
+        assert_array_equal(np.array(snow_result), np.array(pandas_result))
+
+    # Non-default subok, order and dtype are expected to raise TypeError
+    with pytest.raises(TypeError):
+        np.full_like(snow_df, 1234, subok=False)
+
+    with pytest.raises(TypeError):
+        np.full_like(snow_df, 1234, order="D")
+
+    with pytest.raises(TypeError):
+        np.full_like(snow_df, 1234, dtype=int)
+
+
 def test_logical_operators():
     data = {
         "A":
[0, 1, 2, 0, 1, 2, 0, 1, 2],