GH-41692: [Python] Improve substrait extended expressions support (#4…

…1693) Addresses some missing features and usability issues when using PyArrow with Substrait ExtendedExpressions * GitHub Issue: #41692 - [x] Allow passing `BoundExpressions` for `Scanner(columns=X)` instead of a dict of expressions. - [x] Allow passing `BoundExpressions` for `Scanner(filter=X)` so that user doesn't have to distinguish between `Expression` and `BoundExpressions` and can always just use `pyarrow.substrait.deserialize_expressions` - [x] Allow decoding `pyarrow.BoundExpressions` directly from `protobuf.Message`, thus allowing to use substrait-python objects. - [x] Return `memoryview` from methods encoding substrait, so that those can be directly passed to substrait-python (or more in general other python libraries) without a copy being involved. - [x] Allow decoding messages from `memoryview` so that the output of encoding functions can be sent back to decoding functions. - [x] Allow to encode and decode schemas from substrait - [x] When encoding schemas return the extension types required for a substrait consumer to decode the schema - [x] Handle arrow extension types when decoding a schema - [x] Update docstrings and documentation --------- Co-authored-by: Raúl Cumplido <raulcumplido@gmail.com>
apache · Oct 16, 2024 · e0ab40d · e0ab40d
1 parent bb900c1
commit e0ab40d
Show file tree

Hide file tree

Showing 12 changed files with 521 additions and 11 deletions.
diff --git a/cpp/src/arrow/engine/substrait/serde.cc b/cpp/src/arrow/engine/substrait/serde.cc
@@ -56,7 +56,7 @@ Status ParseFromBufferImpl(const Buffer& buf, const std::string& full_name,
   if (message->ParseFromZeroCopyStream(&buf_stream)) {
     return Status::OK();
   }
-  return Status::IOError("ParseFromZeroCopyStream failed for ", full_name);
+  return Status::Invalid("ParseFromZeroCopyStream failed for ", full_name);
 }
 
 template <typename Message>

diff --git a/docs/source/python/api/substrait.rst b/docs/source/python/api/substrait.rst
@@ -43,6 +43,9 @@ compute expressions.
    BoundExpressions
    deserialize_expressions
    serialize_expressions
+   serialize_schema
+   deserialize_schema
+   SubstraitSchema
 
 Utility
 -------

diff --git a/docs/source/python/integration.rst b/docs/source/python/integration.rst
@@ -34,6 +34,7 @@ This allows to easily integrate PyArrow with other languages and technologies.
 .. toctree::
    :maxdepth: 2
 
+   integration/substrait
    integration/python_r
    integration/python_java
    integration/extending

diff --git a/docs/source/python/integration/substrait.rst b/docs/source/python/integration/substrait.rst
@@ -0,0 +1,249 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements.  See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership.  The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License.  You may obtain a copy of the License at
+
+..   http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied.  See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+=========
+Substrait
+=========
+
+The ``arrow-substrait`` module implements support for the Substrait_ format,
+enabling conversion to and from Arrow objects.
+
+The ``arrow-dataset`` module can execute Substrait_ plans via the
+:doc:`Acero <../../cpp/streaming_execution>` query engine.
+
+.. contents::
+
+Working with Schemas
+====================
+
+Arrow schemas can be encoded and decoded using the :func:`pyarrow.substrait.serialize_schema` and
+:func:`pyarrow.substrait.deserialize_schema` functions.
+
+.. code-block:: python
+
+    import pyarrow as pa
+    import pyarrow.substrait as pa_substrait
+
+    arrow_schema = pa.schema([
+        pa.field("x", pa.int32()),
+        pa.field("y", pa.string())
+    ])
+    substrait_schema = pa_substrait.serialize_schema(arrow_schema)
+
+The schema marshalled as a Substrait ``NamedStruct`` is directly
+available as ``substrait_schema.schema``::
+
+    >>> print(substrait_schema.schema)
+    b'\n\x01x\n\x01y\x12\x0c\n\x04*\x02\x10\x01\n\x04b\x02\x10\x01'
+
+If custom Arrow types are used, the schema requires extensions
+for those types to be actually usable. For this reason,
+the schema is also available as an `Extended Expression`_ that includes
+all the required extension types::
+
+    >>> print(substrait_schema.expression)
+    b'"\x14\n\x01x\n\x01y\x12\x0c\n\x04*\x02\x10\x01\n\x04b\x02\x10\x01:\x19\x10,*\x15Acero 17.0.0'
+
+If `Substrait Python`_ is installed, the schema can also be converted to
+a ``substrait-python`` object::
+
+    >>> print(substrait_schema.to_pysubstrait())
+    version {
+        minor_number: 44
+        producer: "Acero 17.0.0"
+    }
+    base_schema {
+        names: "x"
+        names: "y"
+        struct {
+            types {
+                i32 {
+                    nullability: NULLABILITY_NULLABLE
+                }
+            }
+            types {
+                string {
+                    nullability: NULLABILITY_NULLABLE
+                }
+            }
+        }
+    }
+
+Working with Expressions
+========================
+
+Arrow compute expressions can be encoded and decoded using the
+:func:`pyarrow.substrait.serialize_expressions` and
+:func:`pyarrow.substrait.deserialize_expressions` functions.
+
+.. code-block:: python
+
+    import pyarrow as pa
+    import pyarrow.compute as pc
+    import pyarrow.substrait as pa_substrait
+
+    arrow_schema = pa.schema([
+        pa.field("x", pa.int32()),
+        pa.field("y", pa.int32())
+    ])
+
+    substrait_expr = pa_substrait.serialize_expressions(
+        exprs=[pc.field("x") + pc.field("y")],
+        names=["total"],
+        schema=arrow_schema
+    )
+
+The result of encoding an expression to Substrait is the serialized
+protobuf ``ExtendedExpression`` message data itself::
+
+    >>> print(bytes(substrait_expr))
+    b'\nZ\x12Xhttps://github.com/substrait-io/substrait/blob/main/extensions/functions_arithmetic.yaml\x12\x07\x1a\x05\x1a\x03add\x1a>\n5\x1a3\x1a\x04*\x02\x10\x01"\n\x1a\x08\x12\x06\n\x02\x12\x00"\x00"\x0c\x1a\n\x12\x08\n\x04\x12\x02\x08\x01"\x00*\x11\n\x08overflow\x12\x05ERROR\x1a\x05total"\x14\n\x01x\n\x01y\x12\x0c\n\x04*\x02\x10\x01\n\x04*\x02\x10\x01:\x19\x10,*\x15Acero 17.0.0'
+
+If a `Substrait Python`_ object is required, the message
+has to be decoded by ``substrait-python`` itself::
+
+    >>> import substrait
+    >>> pysubstrait_expr = substrait.proto.ExtendedExpression.FromString(substrait_expr)
+    >>> print(pysubstrait_expr)
+    version {
+      minor_number: 44
+      producer: "Acero 17.0.0"
+    }
+    extension_uris {
+      uri: "https://github.com/substrait-io/substrait/blob/main/extensions/functions_arithmetic.yaml"
+    }
+    extensions {
+      extension_function {
+        name: "add"
+      }
+    }
+    referred_expr {
+      expression {
+        scalar_function {
+          arguments {
+            value {
+              selection {
+                direct_reference {
+                  struct_field {
+                  }
+                }
+                root_reference {
+                }
+              }
+            }
+          }
+          arguments {
+            value {
+              selection {
+                direct_reference {
+                  struct_field {
+                    field: 1
+                  }
+                }
+                root_reference {
+                }
+              }
+            }
+          }
+          options {
+            name: "overflow"
+            preference: "ERROR"
+          }
+          output_type {
+            i32 {
+              nullability: NULLABILITY_NULLABLE
+            }
+          }
+        }
+      }
+      output_names: "total"
+    }
+    base_schema {
+      names: "x"
+      names: "y"
+      struct {
+        types {
+          i32 {
+            nullability: NULLABILITY_NULLABLE
+          }
+        }
+        types {
+          i32 {
+            nullability: NULLABILITY_NULLABLE
+          }
+        }
+      }
+    }
+
+Executing Queries Using Substrait Extended Expressions
+======================================================
+
+Dataset supports executing queries using Substrait's `Extended Expression`_.
+The expressions can be passed to the dataset scanner in the form of
+:class:`pyarrow.substrait.BoundExpressions`.
+
+.. code-block:: python
+
+    import pyarrow.dataset as ds
+    import pyarrow.substrait as pa_substrait
+
+    # Use substrait-python to create the queries
+    from substrait import proto
+
+    dataset = ds.dataset("./data/index-0.parquet")
+    substrait_schema = pa_substrait.serialize_schema(dataset.schema).to_pysubstrait()
+
+    # SELECT project_name FROM dataset WHERE project_name = 'pyarrow'
+
+    projection = proto.ExtendedExpression(referred_expr=[
+        {"expression": {"selection": {"direct_reference": {"struct_field": {"field": 0}}}},
+        "output_names": ["project_name"]}
+    ])
+    projection.MergeFrom(substrait_schema)
+
+    filtering = proto.ExtendedExpression(
+            extension_uris=[{"extension_uri_anchor": 99, "uri": "/functions_comparison.yaml"}],
+            extensions=[{"extension_function": {"extension_uri_reference": 99, "function_anchor": 199, "name": "equal:any1_any1"}}],
+            referred_expr=[
+                {"expression": {"scalar_function": {"function_reference": 199, "arguments": [
+                    {"value": {"selection": {"direct_reference": {"struct_field": {"field": 0}}}}},
+                    {"value": {"literal": {"string": "pyarrow"}}}
+                ], "output_type": {"bool": {"nullability": False}}}}}
+            ]
+    )
+    filtering.MergeFrom(substrait_schema)
+
+    results = dataset.scanner(
+        columns=pa_substrait.BoundExpressions.from_substrait(projection),
+        filter=pa_substrait.BoundExpressions.from_substrait(filtering)
+    ).head(5)
+
+
+.. code-block:: text
+
+    project_name
+    0      pyarrow
+    1      pyarrow
+    2      pyarrow
+    3      pyarrow
+    4      pyarrow
+
+
+.. _`Substrait`: https://substrait.io/
+.. _`Substrait Python`: https://github.com/substrait-io/substrait-python
+.. _`Acero`: https://arrow.apache.org/docs/cpp/streaming_execution.html
+.. _`Extended Expression`: https://github.com/substrait-io/substrait/blob/main/site/docs/expressions/extended_expression.md
diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx
@@ -2441,7 +2441,7 @@ cdef class Expression(_Weakrefable):
         )
 
     @staticmethod
-    def from_substrait(object buffer not None):
+    def from_substrait(object message not None):
         """
         Deserialize an expression from Substrait
 
@@ -2453,15 +2453,15 @@ cdef class Expression(_Weakrefable):
 
         Parameters
         ----------
-        buffer : bytes or Buffer
+        message : bytes or Buffer or a protobuf Message
             The Substrait message to deserialize
 
         Returns
         -------
         Expression
             The deserialized expression
         """
-        expressions = _pas().deserialize_expressions(buffer).expressions
+        expressions = _pas().BoundExpressions.from_substrait(message).expressions
         if len(expressions) == 0:
             raise ValueError("Substrait message did not contain any expressions")
         if len(expressions) > 1:

diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx
@@ -39,6 +39,11 @@ from pyarrow.util import _is_iterable, _is_path_like, _stringify_path
 from pyarrow._json cimport ParseOptions as JsonParseOptions
 from pyarrow._json cimport ReadOptions as JsonReadOptions
 
+try:
+    import pyarrow.substrait as pa_substrait
+except ImportError:
+    pa_substrait = None
+
 
 _DEFAULT_BATCH_SIZE = 2**17
 _DEFAULT_BATCH_READAHEAD = 16
@@ -272,6 +277,13 @@ cdef class Dataset(_Weakrefable):
 
         # at the moment only support filter
         requested_filter = options.get("filter")
+        if pa_substrait and isinstance(requested_filter, pa_substrait.BoundExpressions):
+            expressions = list(requested_filter.expressions.values())
+            if len(expressions) != 1:
+                raise ValueError(
+                    "Only one BoundExpressions with a single expression are supported")
+            new_options["filter"] = requested_filter = expressions[0]
+
         current_filter = self._scan_options.get("filter")
         if requested_filter is not None and current_filter is not None:
             new_options["filter"] = current_filter & requested_filter
@@ -282,7 +294,7 @@ cdef class Dataset(_Weakrefable):
 
     def scanner(self,
                 object columns=None,
-                Expression filter=None,
+                object filter=None,
                 int batch_size=_DEFAULT_BATCH_SIZE,
                 int batch_readahead=_DEFAULT_BATCH_READAHEAD,
                 int fragment_readahead=_DEFAULT_FRAGMENT_READAHEAD,
@@ -3447,6 +3459,9 @@ cdef void _populate_builder(const shared_ptr[CScannerBuilder]& ptr,
         filter, pyarrow_wrap_schema(builder.schema()))))
 
     if columns is not None:
+        if pa_substrait and isinstance(columns, pa_substrait.BoundExpressions):
+            columns = columns.expressions
+
         if isinstance(columns, dict):
             for expr in columns.values():
                 if not isinstance(expr, Expression):
@@ -3527,7 +3542,7 @@ cdef class Scanner(_Weakrefable):
     @staticmethod
     def from_dataset(Dataset dataset not None, *,
                      object columns=None,
-                     Expression filter=None,
+                     object filter=None,
                      int batch_size=_DEFAULT_BATCH_SIZE,
                      int batch_readahead=_DEFAULT_BATCH_READAHEAD,
                      int fragment_readahead=_DEFAULT_FRAGMENT_READAHEAD,