Skip to content

Commit

Permalink
SNOW-1748104,SNOW-1748107, SNOW-1748108: Add support for pd.read_pick…
Browse files Browse the repository at this point in the history
…le/read_html/read_xml
  • Loading branch information
sfc-gh-helmeleegy committed Oct 30, 2024
1 parent 5a0aca2 commit 60f6a33
Show file tree
Hide file tree
Showing 10 changed files with 524 additions and 34 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@
- Added support for `Index.to_numpy`.
- Added support for `DataFrame.align` and `Series.align` for `axis=0`.
- Added support for `size` in `GroupBy.aggregate`, `DataFrame.aggregate`, and `Series.aggregate`.
- Added support for `pd.read_pickle` (Uses native pandas for processing).
- Added support for `pd.read_html` (Uses native pandas for processing).
- Added support for `pd.read_xml` (Uses native pandas for processing).

### Snowpark Local Testing Updates

Expand Down
3 changes: 3 additions & 0 deletions docs/source/modin/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ Input/Output
read_json
read_parquet
read_sas
read_pickle
read_html
read_xml

.. rubric:: SQL

Expand Down
6 changes: 6 additions & 0 deletions docs/source/modin/supported/general_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ Data manipulations
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``qcut`` | P | | ``N`` if ``labels!=False`` or ``retbins=True``. |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``read_pickle`` | Y | | Uses native pandas for reading. |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``read_csv`` | P | | Reads both local and staged file(s) into a Snowpark|
| | | | pandas DataFrame. Note, the order of rows in the |
| | | | DataFrame may differ from the order of rows in the original |
Expand Down Expand Up @@ -84,6 +86,10 @@ Data manipulations
| | | ``dtype_backend``, and | |
| | | ``storage_options`` are ignored. | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``read_html`` | Y | | Uses native pandas for reading. |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``read_xml`` | Y | | Uses native pandas for reading. |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``read_parquet`` | P | ``use_nullable_dtypes``, | Supported parameter(s) are: ``columns`` |
| | | ``filesystem``, and ``filters`` | |
| | | will raise an error if used. | |
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
"pytest-assume", # sql counter check
"decorator", # sql counter check
"protoc-wheel-0", # Protocol buffer compiler, for Snowpark IR
"lxml", # used in read_xml tests
]

# read the version
Expand Down
386 changes: 384 additions & 2 deletions src/snowflake/snowpark/modin/plugin/extensions/io_overrides.py

Large diffs are not rendered by default.

46 changes: 17 additions & 29 deletions src/snowflake/snowpark/modin/plugin/io/snow_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -593,28 +593,18 @@ def read_gbq(
pass # pragma: no cover

@classmethod
# Placeholder definition of ``read_html``: the body is a bare ``pass`` and the
# function is wrapped by ``pandas_module_level_function_not_implemented()``.
# NOTE(review): that decorator is defined outside this view; presumably it makes
# calls report the API as not implemented -- confirm against its definition.
# Apart from ``io``, every option in the signature is keyword-only (after ``*``).
@pandas_module_level_function_not_implemented()
def read_html(
cls,
io,
*,
match=".+",
flavor=None,
header=None,
index_col=None,
skiprows=None,
attrs=None,
parse_dates=False,
thousands=",",
encoding=None,
decimal=".",
converters=None,
na_values=None,
keep_default_na=True,
displayed_only=True,
**kwargs,
):
pass # pragma: no cover
def read_html(cls, **kwargs) -> list[SnowflakeQueryCompiler]:
    """
    Parse HTML tables with native pandas and wrap each one in a query compiler.

    All keyword arguments are forwarded unchanged to ``pandas.read_html``. Each
    DataFrame it returns is converted with ``cls.from_pandas``, and the results
    are returned as a list in the same order.
    """
    native_frames = pandas.read_html(**kwargs)
    return [cls.from_pandas(native_frame) for native_frame in native_frames]

@classmethod
def read_xml(cls, **kwargs) -> SnowflakeQueryCompiler:
    """
    Parse an XML document with native pandas and wrap it in a query compiler.

    All keyword arguments are forwarded unchanged to ``pandas.read_xml``; the
    resulting DataFrame is converted with ``cls.from_pandas``.
    """
    native_frame = pandas.read_xml(**kwargs)
    return cls.from_pandas(native_frame)

@classmethod
@pandas_module_level_function_not_implemented()
Expand Down Expand Up @@ -665,13 +655,11 @@ def read_sas(cls, **kwargs): # noqa: PR01
return cls.from_pandas(pandas.read_sas(**kwargs))

@classmethod
# Placeholder definition of ``read_pickle``: the body is a bare ``pass`` and the
# function is wrapped by ``pandas_module_level_function_not_implemented()``.
# NOTE(review): that decorator is defined outside this view; presumably it makes
# calls report the API as not implemented -- confirm against its definition.
@pandas_module_level_function_not_implemented()
def read_pickle(
cls,
filepath_or_buffer,
**kwargs,
):
pass # pragma: no cover
def read_pickle(cls, **kwargs) -> SnowflakeQueryCompiler:
    """
    Unpickle an object with native pandas and wrap it in a query compiler.

    All keyword arguments are forwarded unchanged to ``pandas.read_pickle``; the
    unpickled object is then passed to ``cls.from_pandas``.

    NOTE(review): ``pandas.read_pickle`` returns whatever object was pickled.
    Whether ``cls.from_pandas`` accepts anything other than a DataFrame is not
    visible here -- confirm before relying on non-DataFrame pickles.

    Security: unpickling can execute arbitrary code, so only load pickle files
    that come from a trusted source.
    """
    unpickled = pandas.read_pickle(**kwargs)
    return cls.from_pandas(unpickled)

@classmethod
@pandas_module_level_function_not_implemented()
Expand Down
43 changes: 43 additions & 0 deletions tests/integ/modin/io/test_read_html.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#
# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
#
import os
import uuid

import modin.pandas as pd
import pandas as native_pd

from tests.integ.modin.utils import assert_frame_equal
from tests.integ.utils.sql_counter import SqlCounter


def test_read_html():
    """
    Check that ``pd.read_html`` matches native pandas on a small HTML table.

    A one-table HTML document is written to a uniquely named scratch file in the
    current working directory. The first table returned by ``pd.read_html`` is
    compared (ignoring dtypes) with the first table returned by native pandas,
    inside a ``SqlCounter`` configured with ``query_count=1``.
    """
    html_str = """
<table>
<tr>
<th>A</th>
<th colspan="1">B</th>
<th rowspan="1">C</th>
</tr>
<tr>
<td>a</td>
<td>b</td>
<td>c</td>
</tr>
</table>
"""
    filename = f"test_read_html_{str(uuid.uuid4())}"

    try:
        # Create the file inside the try block so the finally clause also removes
        # a file that was created but whose write failed part-way through.
        # An explicit encoding keeps the fixture independent of the platform
        # default.
        with open(filename, "w", encoding="utf-8") as f:
            f.write(html_str)

        with SqlCounter(query_count=1):
            assert_frame_equal(
                pd.read_html(filename)[0],
                native_pd.read_html(filename)[0],
                check_dtype=False,
            )
    finally:
        # Always clean up the scratch file, whether or not the comparison passed.
        if os.path.exists(filename):
            os.remove(filename)
28 changes: 28 additions & 0 deletions tests/integ/modin/io/test_read_pickle.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#
# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
#
import os
import uuid

import modin.pandas as pd
import pandas as native_pd

from tests.integ.modin.utils import assert_frame_equal
from tests.integ.utils.sql_counter import SqlCounter


def test_read_pickle():
    """
    Check that ``pd.read_pickle`` matches native pandas on a pickled DataFrame.

    A small native DataFrame is pickled to a uniquely named scratch file, read
    back through both ``pd.read_pickle`` and native pandas, and the two results
    are compared (ignoring dtypes) inside a ``SqlCounter`` configured with
    ``query_count=1``. The scratch file is removed afterwards.
    """
    native_df = native_pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
    pickle_path = "test_read_pickle_" + str(uuid.uuid4())

    try:
        native_pd.to_pickle(native_df, pickle_path)
        with SqlCounter(query_count=1):
            snow_result = pd.read_pickle(pickle_path)
            native_result = native_pd.read_pickle(pickle_path)
            assert_frame_equal(snow_result, native_result, check_dtype=False)
    finally:
        # Remove the scratch file whether or not the comparison succeeded.
        if os.path.exists(pickle_path):
            os.remove(pickle_path)
39 changes: 39 additions & 0 deletions tests/integ/modin/io/test_read_xml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#
# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
#
from io import StringIO

import modin.pandas as pd
import pandas as native_pd

from tests.integ.modin.utils import assert_frame_equal
from tests.integ.utils.sql_counter import SqlCounter


def test_read_xml():
    """
    Check that ``pd.read_xml`` matches native pandas on an in-memory document.

    The XML has three ``row`` elements, one of which has an empty ``sides``
    element. Each reader gets its own ``StringIO`` over the same text, and the
    results are compared (ignoring dtypes) inside a ``SqlCounter`` configured
    with ``query_count=1``.
    """
    xml_text = """<?xml version='1.0' encoding='utf-8'?>
<data xmlns="http://example.com">
<row>
<shape>square</shape>
<degrees>360</degrees>
<sides>4.0</sides>
</row>
<row>
<shape>circle</shape>
<degrees>360</degrees>
<sides/>
</row>
<row>
<shape>triangle</shape>
<degrees>180</degrees>
<sides>3.0</sides>
</row>
</data>
"""

    with SqlCounter(query_count=1):
        snow_result = pd.read_xml(StringIO(xml_text))
        native_result = native_pd.read_xml(StringIO(xml_text))
        assert_frame_equal(snow_result, native_result, check_dtype=False)
3 changes: 0 additions & 3 deletions tests/unit/modin/test_unsupported.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,11 @@
@pytest.mark.parametrize(
"io_method, kwargs",
[
["read_xml", {"path_or_buffer": ""}],
["read_gbq", {"query": ""}],
["read_html", {"io": ""}],
["read_clipboard", {}],
["read_hdf", {"path_or_buf": ""}],
["read_feather", {"path": ""}],
["read_stata", {"filepath_or_buffer": ""}],
["read_pickle", {"filepath_or_buffer": ""}],
["read_sql", {"sql": "", "con": ""}],
["read_fwf", {"filepath_or_buffer": ""}],
["read_sql_table", {"table_name": "", "con": ""}],
Expand Down

0 comments on commit 60f6a33

Please sign in to comment.