Skip to content

Commit 04f8241

Browse files
committed
Revert "UNPICK changes to review"
This reverts commit 83ac4d1.
1 parent 83ac4d1 commit 04f8241

File tree

23 files changed

+744
-179
lines changed

23 files changed

+744
-179
lines changed

docs/source/conf.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,9 @@ def autoapi_skip_member_fn(app, what, name, obj, skip, options) -> bool: # noqa
8383
# Duplicate modules (skip module-level docs to avoid duplication)
8484
("module", "datafusion.col"),
8585
("module", "datafusion.udf"),
86+
# Private variables causing duplicate documentation
87+
("data", "datafusion.utils._PYARROW_DATASET_TYPES"),
88+
("variable", "datafusion.utils._PYARROW_DATASET_TYPES"),
8689
# Deprecated
8790
("class", "datafusion.substrait.serde"),
8891
("class", "datafusion.substrait.plan"),
@@ -91,9 +94,28 @@ def autoapi_skip_member_fn(app, what, name, obj, skip, options) -> bool: # noqa
9194
("method", "datafusion.context.SessionContext.tables"),
9295
("method", "datafusion.dataframe.DataFrame.unnest_column"),
9396
]
97+
# Explicitly skip certain members listed above. These are either
98+
# re-exports, duplicate module-level documentation, deprecated
99+
# API surfaces, or private variables that would otherwise appear
100+
# in the generated docs and cause confusing duplication.
101+
# Keeping this explicit list avoids surprising entries in the
102+
# AutoAPI output and gives us a single place to opt-out items
103+
# when we intentionally hide them from the docs.
94104
if (what, name) in skip_contents:
95105
skip = True
96106

107+
# Skip private module-level names (those whose final component
108+
# starts with an underscore) when AutoAPI is rendering data or
109+
# variable entries. Many internal module-level constants are
110+
# implementation details (for example private pyarrow dataset type
111+
# mappings) that would otherwise be emitted as top-level "data"
112+
# or "variable" docs. Filtering them here avoids noisy,
113+
# duplicate, or implementation-specific entries in the public
114+
# documentation while still allowing public members and types to
115+
# be documented normally.
116+
if name.split(".")[-1].startswith("_") and what in ("data", "variable"):
117+
skip = True
118+
97119
return skip
98120

99121

docs/source/contributor-guide/ffi.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ as performant as possible and to utilize the features of DataFusion, you may dec
3434
your source in Rust and then expose it through `PyO3 <https://pyo3.rs>`_ as a Python library.
3535

3636
At first glance, it may appear the best way to do this is to add the ``datafusion-python``
37-
crate as a dependency, provide a ``PyTable``, and then to register it with the
37+
crate as a dependency, produce a DataFusion table in Rust, and then register it with the
3838
``SessionContext``. Unfortunately, this will not work.
3939

4040
When you produce your code as a Python library and it needs to interact with the DataFusion

docs/source/user-guide/data-sources.rst

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -152,13 +152,22 @@ as Delta Lake. This will require a recent version of
152152
.. code-block:: python
153153
154154
from deltalake import DeltaTable
155+
from datafusion import TableProvider
155156
156157
delta_table = DeltaTable("path_to_table")
157-
ctx.register_table_provider("my_delta_table", delta_table)
158+
provider = TableProvider.from_capsule(delta_table.__datafusion_table_provider__())
159+
ctx.register_table("my_delta_table", provider)
158160
df = ctx.table("my_delta_table")
159161
df.show()
160162
161-
On older versions of ``deltalake`` (prior to 0.22) you can use the
163+
.. note::
164+
165+
:py:meth:`~datafusion.context.SessionContext.register_table_provider` is
166+
deprecated. Use
167+
:py:meth:`~datafusion.context.SessionContext.register_table` with a
168+
:py:class:`~datafusion.TableProvider` instead.
169+
170+
On older versions of ``deltalake`` (prior to 0.22) you can use the
162171
`Arrow DataSet <https://arrow.apache.org/docs/python/generated/pyarrow.dataset.Dataset.html>`_
163172
interface to import to DataFusion, but this does not support features such as filter push down
164173
which can lead to a significant performance difference.

docs/source/user-guide/io/table_provider.rst

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -39,20 +39,47 @@ A complete example can be found in the `examples folder <https://github.com/apac
3939
) -> PyResult<Bound<'py, PyCapsule>> {
4040
let name = CString::new("datafusion_table_provider").unwrap();
4141
42-
let provider = Arc::new(self.clone())
43-
.map_err(|e| PyRuntimeError::new_err(e.to_string()))?;
44-
let provider = FFI_TableProvider::new(Arc::new(provider), false);
42+
let provider = Arc::new(self.clone());
43+
let provider = FFI_TableProvider::new(provider, false, None);
4544
4645
PyCapsule::new_bound(py, provider, Some(name.clone()))
4746
}
4847
}
4948
50-
Once you have this library available, in python you can register your table provider
51-
to the ``SessionContext``.
49+
Once you have this library available, you can construct a
50+
:py:class:`~datafusion.TableProvider` in Python and register it with the
51+
``SessionContext``. Table providers can be created either from the PyCapsule exposed by
52+
your Rust provider or from an existing :py:class:`~datafusion.dataframe.DataFrame`.
53+
Call the provider's ``__datafusion_table_provider__()`` method to obtain the capsule
54+
before constructing a ``TableProvider``. The ``TableProvider.from_view()`` helper is
55+
deprecated; instead use ``TableProvider.from_dataframe()`` or ``DataFrame.into_view()``.
56+
57+
.. note::
58+
59+
:py:meth:`~datafusion.context.SessionContext.register_table_provider` is
60+
deprecated. Use
61+
:py:meth:`~datafusion.context.SessionContext.register_table` with the
62+
resulting :py:class:`~datafusion.TableProvider` instead.
5263

5364
.. code-block:: python
5465
66+
from datafusion import SessionContext, TableProvider
67+
68+
ctx = SessionContext()
5569
provider = MyTableProvider()
56-
ctx.register_table_provider("my_table", provider)
5770
58-
ctx.table("my_table").show()
71+
capsule = provider.__datafusion_table_provider__()
72+
capsule_provider = TableProvider.from_capsule(capsule)
73+
74+
df = ctx.from_pydict({"a": [1]})
75+
view_provider = TableProvider.from_dataframe(df)
76+
# or: view_provider = df.into_view()
77+
78+
ctx.register_table("capsule_table", capsule_provider)
79+
ctx.register_table("view_table", view_provider)
80+
81+
ctx.table("capsule_table").show()
82+
ctx.table("view_table").show()
83+
84+
Both ``TableProvider.from_capsule()`` and ``TableProvider.from_dataframe()`` create
85+
table providers that can be registered with the SessionContext using ``register_table()``.

examples/datafusion-ffi-example/python/tests/_test_table_function.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def test_ffi_table_function_call_directly():
5353
table_udtf = udtf(table_func, "my_table_func")
5454

5555
my_table = table_udtf()
56-
ctx.register_table_provider("t", my_table)
56+
ctx.register_table("t", my_table)
5757
result = ctx.table("t").collect()
5858

5959
assert len(result) == 2

examples/datafusion-ffi-example/python/tests/_test_table_provider.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,16 @@
1818
from __future__ import annotations
1919

2020
import pyarrow as pa
21-
from datafusion import SessionContext
21+
from datafusion import SessionContext, TableProvider
2222
from datafusion_ffi_example import MyTableProvider
2323

2424

2525
def test_table_loading():
2626
ctx = SessionContext()
2727
table = MyTableProvider(3, 2, 4)
28-
ctx.register_table_provider("t", table)
28+
ctx.register_table(
29+
"t", TableProvider.from_capsule(table.__datafusion_table_provider__())
30+
)
2931
result = ctx.table("t").collect()
3032

3133
assert len(result) == 4

python/datafusion/__init__.py

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -28,17 +28,16 @@
2828
try:
2929
import importlib.metadata as importlib_metadata
3030
except ImportError:
31-
import importlib_metadata
31+
import importlib_metadata # type: ignore[import]
3232

33+
# Public submodules
3334
from . import functions, object_store, substrait, unparser
3435

3536
# The following imports are okay to remain as opaque to the user.
36-
from ._internal import Config
37+
from ._internal import EXPECTED_PROVIDER_MSG, Config
3738
from .catalog import Catalog, Database, Table
3839
from .col import col, column
39-
from .common import (
40-
DFSchema,
41-
)
40+
from .common import DFSchema
4241
from .context import (
4342
RuntimeEnvBuilder,
4443
SessionConfig,
@@ -47,13 +46,11 @@
4746
)
4847
from .dataframe import DataFrame, ParquetColumnOptions, ParquetWriterOptions
4948
from .dataframe_formatter import configure_formatter
50-
from .expr import (
51-
Expr,
52-
WindowFrame,
53-
)
49+
from .expr import Expr, WindowFrame
5450
from .io import read_avro, read_csv, read_json, read_parquet
5551
from .plan import ExecutionPlan, LogicalPlan
5652
from .record_batch import RecordBatch, RecordBatchStream
53+
from .table_provider import TableProvider
5754
from .user_defined import (
5855
Accumulator,
5956
AggregateUDF,
@@ -69,6 +66,7 @@
6966
__version__ = importlib_metadata.version(__name__)
7067

7168
__all__ = [
69+
"EXPECTED_PROVIDER_MSG",
7270
"Accumulator",
7371
"AggregateUDF",
7472
"Catalog",
@@ -90,6 +88,7 @@
9088
"SessionContext",
9189
"Table",
9290
"TableFunction",
91+
"TableProvider",
9392
"WindowFrame",
9493
"WindowUDF",
9594
"catalog",

python/datafusion/catalog.py

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,14 @@
2323
from typing import TYPE_CHECKING, Protocol
2424

2525
import datafusion._internal as df_internal
26+
from datafusion.utils import _normalize_table_provider
2627

2728
if TYPE_CHECKING:
2829
import pyarrow as pa
2930

31+
from datafusion import TableProvider
32+
from datafusion.context import TableProviderExportable
33+
3034
try:
3135
from warnings import deprecated # Python 3.13+
3236
except ImportError:
@@ -82,7 +86,11 @@ def database(self, name: str = "public") -> Schema:
8286
"""Returns the database with the given ``name`` from this catalog."""
8387
return self.schema(name)
8488

85-
def register_schema(self, name, schema) -> Schema | None:
89+
def register_schema(
90+
self,
91+
name: str,
92+
schema: Schema | SchemaProvider | SchemaProviderExportable,
93+
) -> Schema | None:
8694
"""Register a schema with this catalog."""
8795
if isinstance(schema, Schema):
8896
return self.catalog.register_schema(name, schema._raw_schema)
@@ -122,11 +130,16 @@ def table(self, name: str) -> Table:
122130
"""Return the table with the given ``name`` from this schema."""
123131
return Table(self._raw_schema.table(name))
124132

125-
def register_table(self, name, table) -> None:
126-
"""Register a table provider in this schema."""
127-
if isinstance(table, Table):
128-
return self._raw_schema.register_table(name, table.table)
129-
return self._raw_schema.register_table(name, table)
133+
def register_table(
134+
self, name: str, table: Table | TableProvider | TableProviderExportable
135+
) -> None:
136+
"""Register a table or table provider in this schema.
137+
138+
Objects implementing ``__datafusion_table_provider__`` are also supported
139+
and treated as :class:`TableProvider` instances.
140+
"""
141+
provider = _normalize_table_provider(table)
142+
return self._raw_schema.register_table(name, provider)
130143

131144
def deregister_table(self, name: str) -> None:
132145
"""Deregister a table provider from this schema."""
@@ -219,14 +232,19 @@ def table(self, name: str) -> Table | None:
219232
"""Retrieve a specific table from this schema."""
220233
...
221234

222-
def register_table(self, name: str, table: Table) -> None: # noqa: B027
223-
"""Add a table from this schema.
235+
def register_table( # noqa: B027
236+
self, name: str, table: Table | TableProvider | TableProviderExportable
237+
) -> None:
238+
"""Add a table to this schema.
224239
225240
This method is optional. If your schema provides a fixed list of tables, you do
226241
not need to implement this method.
242+
243+
Objects implementing ``__datafusion_table_provider__`` are also supported
244+
and treated as :class:`TableProvider` instances.
227245
"""
228246

229-
def deregister_table(self, name, cascade: bool) -> None: # noqa: B027
247+
def deregister_table(self, name: str, cascade: bool) -> None: # noqa: B027
230248
"""Remove a table from this schema.
231249
232250
This method is optional. If your schema provides a fixed list of tables, you do

python/datafusion/context.py

Lines changed: 42 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,11 @@
2929

3030
import pyarrow as pa
3131

32-
from datafusion.catalog import Catalog, CatalogProvider, Table
32+
from datafusion.catalog import Catalog
3333
from datafusion.dataframe import DataFrame
34-
from datafusion.expr import SortKey, sort_list_to_raw_sort_list
34+
from datafusion.expr import sort_list_to_raw_sort_list
3535
from datafusion.record_batch import RecordBatchStream
36-
from datafusion.user_defined import AggregateUDF, ScalarUDF, TableFunction, WindowUDF
36+
from datafusion.utils import _normalize_table_provider
3737

3838
from ._internal import RuntimeEnvBuilder as RuntimeEnvBuilderInternal
3939
from ._internal import SessionConfig as SessionConfigInternal
@@ -48,7 +48,16 @@
4848
import pandas as pd
4949
import polars as pl # type: ignore[import]
5050

51+
from datafusion import TableProvider
52+
from datafusion.catalog import CatalogProvider, Table
53+
from datafusion.expr import SortKey
5154
from datafusion.plan import ExecutionPlan, LogicalPlan
55+
from datafusion.user_defined import (
56+
AggregateUDF,
57+
ScalarUDF,
58+
TableFunction,
59+
WindowUDF,
60+
)
5261

5362

5463
class ArrowStreamExportable(Protocol):
@@ -733,7 +742,7 @@ def from_polars(self, data: pl.DataFrame, name: str | None = None) -> DataFrame:
733742
# https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116
734743
# is the discussion on how we arrived at adding register_view
735744
def register_view(self, name: str, df: DataFrame) -> None:
736-
"""Register a :py:class: `~datafusion.detaframe.DataFrame` as a view.
745+
"""Register a :py:class:`~datafusion.dataframe.DataFrame` as a view.
737746
738747
Args:
739748
name (str): The name to register the view under.
@@ -742,16 +751,29 @@ def register_view(self, name: str, df: DataFrame) -> None:
742751
view = df.into_view()
743752
self.ctx.register_table(name, view)
744753

745-
def register_table(self, name: str, table: Table) -> None:
746-
"""Register a :py:class: `~datafusion.catalog.Table` as a table.
754+
def register_table(
755+
self, name: str, table: Table | TableProvider | TableProviderExportable
756+
) -> None:
757+
"""Register a Table or TableProvider.
747758
748-
The registered table can be referenced from SQL statement executed against.
759+
The registered table can be referenced from SQL statements executed against
760+
this context.
761+
762+
Plain :py:class:`~datafusion.dataframe.DataFrame` objects are not supported;
763+
convert them first with :meth:`datafusion.dataframe.DataFrame.into_view` or
764+
:meth:`datafusion.TableProvider.from_dataframe`.
765+
766+
Objects implementing ``__datafusion_table_provider__`` are also supported
767+
and treated as :py:class:`~datafusion.TableProvider` instances.
749768
750769
Args:
751770
name: Name of the resultant table.
752-
table: DataFusion table to add to the session context.
771+
table: DataFusion :class:`Table`, :class:`TableProvider`, or any object
772+
implementing ``__datafusion_table_provider__`` to add to the session
773+
context.
753774
"""
754-
self.ctx.register_table(name, table.table)
775+
provider = _normalize_table_provider(table)
776+
self.ctx.register_table(name, provider)
755777

756778
def deregister_table(self, name: str) -> None:
757779
"""Remove a table from the session."""
@@ -771,14 +793,21 @@ def register_catalog_provider(
771793
self.ctx.register_catalog_provider(name, provider)
772794

773795
def register_table_provider(
774-
self, name: str, provider: TableProviderExportable
796+
self, name: str, provider: Table | TableProvider | TableProviderExportable
775797
) -> None:
776798
"""Register a table provider.
777799
778-
This table provider must have a method called ``__datafusion_table_provider__``
779-
which returns a PyCapsule that exposes a ``FFI_TableProvider``.
800+
Deprecated: use :meth:`register_table` instead.
801+
802+
Objects implementing ``__datafusion_table_provider__`` are also supported
803+
and treated as :py:class:`~datafusion.TableProvider` instances.
780804
"""
781-
self.ctx.register_table_provider(name, provider)
805+
warnings.warn(
806+
"register_table_provider is deprecated; use register_table",
807+
DeprecationWarning,
808+
stacklevel=2,
809+
)
810+
self.register_table(name, provider)
782811

783812
def register_udtf(self, func: TableFunction) -> None:
784813
"""Register a user defined table function."""

0 commit comments

Comments
 (0)