Skip to content

Commit 8ce157f

Browse files
committed
Revert "UNPICK changes to review"
This reverts commit d0eb360.
1 parent d0eb360 commit 8ce157f

File tree

23 files changed

+701
-204
lines changed

23 files changed

+701
-204
lines changed

docs/source/conf.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,9 @@ def autoapi_skip_member_fn(app, what, name, obj, skip, options) -> bool: # noqa
8383
# Duplicate modules (skip module-level docs to avoid duplication)
8484
("module", "datafusion.col"),
8585
("module", "datafusion.udf"),
86+
# Private variables causing duplicate documentation
87+
("data", "datafusion.utils._PYARROW_DATASET_TYPES"),
88+
("variable", "datafusion.utils._PYARROW_DATASET_TYPES"),
8689
# Deprecated
8790
("class", "datafusion.substrait.serde"),
8891
("class", "datafusion.substrait.plan"),
@@ -91,9 +94,28 @@ def autoapi_skip_member_fn(app, what, name, obj, skip, options) -> bool: # noqa
9194
("method", "datafusion.context.SessionContext.tables"),
9295
("method", "datafusion.dataframe.DataFrame.unnest_column"),
9396
]
97+
# Explicitly skip certain members listed above. These are either
98+
# re-exports, duplicate module-level documentation, deprecated
99+
# API surfaces, or private variables that would otherwise appear
100+
# in the generated docs and cause confusing duplication.
101+
# Keeping this explicit list avoids surprising entries in the
102+
# AutoAPI output and gives us a single place to opt-out items
103+
# when we intentionally hide them from the docs.
94104
if (what, name) in skip_contents:
95105
skip = True
96106

107+
# Skip private module-level names (those whose final component
108+
# starts with an underscore) when AutoAPI is rendering data or
109+
# variable entries. Many internal module-level constants are
110+
# implementation details (for example private pyarrow dataset type
111+
# mappings) that would otherwise be emitted as top-level "data"
112+
# or "variable" docs. Filtering them here avoids noisy,
113+
# duplicate, or implementation-specific entries in the public
114+
# documentation while still allowing public members and types to
115+
# be documented normally.
116+
if name.split(".")[-1].startswith("_") and what in ("data", "variable"):
117+
skip = True
118+
97119
return skip
98120

99121

docs/source/contributor-guide/ffi.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ as performant as possible and to utilize the features of DataFusion, you may dec
3434
your source in Rust and then expose it through `PyO3 <https://pyo3.rs>`_ as a Python library.
3535

3636
At first glance, it may appear the best way to do this is to add the ``datafusion-python``
37-
crate as a dependency, provide a ``PyTable``, and then to register it with the
37+
crate as a dependency, produce a DataFusion table in Rust, and then register it with the
3838
``SessionContext``. Unfortunately, this will not work.
3939

4040
When you produce your code as a Python library and it needs to interact with the DataFusion

docs/source/user-guide/data-sources.rst

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -152,13 +152,26 @@ as Delta Lake. This will require a recent version of
152152
.. code-block:: python
153153
154154
from deltalake import DeltaTable
155+
from datafusion import Table
155156
156157
delta_table = DeltaTable("path_to_table")
157-
ctx.register_table_provider("my_delta_table", delta_table)
158+
table = Table.from_capsule(delta_table.__datafusion_table_provider__())
159+
ctx.register_table("my_delta_table", table)
158160
df = ctx.table("my_delta_table")
159161
df.show()
160162
161-
On older versions of ``deltalake`` (prior to 0.22) you can use the
163+
Objects that implement ``__datafusion_table_provider__`` are supported directly by
164+
:py:meth:`~datafusion.context.SessionContext.register_table`, making it easy to
165+
work with custom table providers from Python libraries such as Delta Lake.
166+
167+
.. note::
168+
169+
:py:meth:`~datafusion.context.SessionContext.register_table_provider` is
170+
deprecated. Use
171+
:py:meth:`~datafusion.context.SessionContext.register_table` with a
172+
:py:class:`~datafusion.Table` instead.
173+
174+
On older versions of ``deltalake`` (prior to 0.22) you can use the
162175
`Arrow DataSet <https://arrow.apache.org/docs/python/generated/pyarrow.dataset.Dataset.html>`_
163176
interface to import to DataFusion, but this does not support features such as filter push down
164177
which can lead to a significant performance difference.

docs/source/user-guide/io/table_provider.rst

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -39,20 +39,47 @@ A complete example can be found in the `examples folder <https://github.com/apac
3939
) -> PyResult<Bound<'py, PyCapsule>> {
4040
let name = CString::new("datafusion_table_provider").unwrap();
4141
42-
let provider = Arc::new(self.clone())
43-
.map_err(|e| PyRuntimeError::new_err(e.to_string()))?;
44-
let provider = FFI_TableProvider::new(Arc::new(provider), false);
42+
let provider = Arc::new(self.clone());
43+
let provider = FFI_TableProvider::new(provider, false, None);
4544
4645
PyCapsule::new_bound(py, provider, Some(name.clone()))
4746
}
4847
}
4948
50-
Once you have this library available, in python you can register your table provider
51-
to the ``SessionContext``.
49+
Once you have this library available, you can construct a
50+
:py:class:`~datafusion.Table` in Python and register it with the
51+
``SessionContext``. Tables can be created either from the PyCapsule exposed by your
52+
Rust provider or from an existing :py:class:`~datafusion.dataframe.DataFrame`.
53+
Call the provider's ``__datafusion_table_provider__()`` method to obtain the capsule
54+
before constructing a ``Table``. The ``Table.from_view()`` helper is
55+
deprecated; instead use ``Table.from_dataframe()`` or ``DataFrame.into_view()``.
56+
57+
.. note::
58+
59+
:py:meth:`~datafusion.context.SessionContext.register_table_provider` is
60+
deprecated. Use
61+
:py:meth:`~datafusion.context.SessionContext.register_table` with the
62+
resulting :py:class:`~datafusion.Table` instead.
5263

5364
.. code-block:: python
5465
66+
from datafusion import SessionContext, Table
67+
68+
ctx = SessionContext()
5569
provider = MyTableProvider()
56-
ctx.register_table_provider("my_table", provider)
5770
58-
ctx.table("my_table").show()
71+
capsule = provider.__datafusion_table_provider__()
72+
capsule_table = Table.from_capsule(capsule)
73+
74+
df = ctx.from_pydict({"a": [1]})
75+
view_table = Table.from_dataframe(df)
76+
# or: view_table = df.into_view()
77+
78+
ctx.register_table("capsule_table", capsule_table)
79+
ctx.register_table("view_table", view_table)
80+
81+
ctx.table("capsule_table").show()
82+
ctx.table("view_table").show()
83+
84+
Both ``Table.from_capsule()`` and ``Table.from_dataframe()`` create
85+
table providers that can be registered with the SessionContext using ``register_table()``.

examples/datafusion-ffi-example/python/tests/_test_table_function.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def test_ffi_table_function_call_directly():
5353
table_udtf = udtf(table_func, "my_table_func")
5454

5555
my_table = table_udtf()
56-
ctx.register_table_provider("t", my_table)
56+
ctx.register_table("t", my_table)
5757
result = ctx.table("t").collect()
5858

5959
assert len(result) == 2

examples/datafusion-ffi-example/python/tests/_test_table_provider.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,16 @@
1818
from __future__ import annotations
1919

2020
import pyarrow as pa
21-
from datafusion import SessionContext
21+
from datafusion import SessionContext, TableProvider
2222
from datafusion_ffi_example import MyTableProvider
2323

2424

2525
def test_table_loading():
2626
ctx = SessionContext()
2727
table = MyTableProvider(3, 2, 4)
28-
ctx.register_table_provider("t", table)
28+
ctx.register_table(
29+
"t", TableProvider.from_capsule(table.__datafusion_table_provider__())
30+
)
2931
result = ctx.table("t").collect()
3032

3133
assert len(result) == 4

python/datafusion/__init__.py

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -28,17 +28,16 @@
2828
try:
2929
import importlib.metadata as importlib_metadata
3030
except ImportError:
31-
import importlib_metadata
31+
import importlib_metadata # type: ignore[import]
3232

33+
# Public submodules
3334
from . import functions, object_store, substrait, unparser
3435

3536
# The following imports are okay to remain as opaque to the user.
36-
from ._internal import Config
37+
from ._internal import EXPECTED_PROVIDER_MSG, Config
3738
from .catalog import Catalog, Database, Table
3839
from .col import col, column
39-
from .common import (
40-
DFSchema,
41-
)
40+
from .common import DFSchema
4241
from .context import (
4342
RuntimeEnvBuilder,
4443
SessionConfig,
@@ -47,10 +46,7 @@
4746
)
4847
from .dataframe import DataFrame, ParquetColumnOptions, ParquetWriterOptions
4948
from .dataframe_formatter import configure_formatter
50-
from .expr import (
51-
Expr,
52-
WindowFrame,
53-
)
49+
from .expr import Expr, WindowFrame
5450
from .io import read_avro, read_csv, read_json, read_parquet
5551
from .plan import ExecutionPlan, LogicalPlan
5652
from .record_batch import RecordBatch, RecordBatchStream
@@ -69,6 +65,7 @@
6965
__version__ = importlib_metadata.version(__name__)
7066

7167
__all__ = [
68+
"EXPECTED_PROVIDER_MSG",
7269
"Accumulator",
7370
"AggregateUDF",
7471
"Catalog",

python/datafusion/catalog.py

Lines changed: 110 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,19 @@
1919

2020
from __future__ import annotations
2121

22+
import warnings
2223
from abc import ABC, abstractmethod
23-
from typing import TYPE_CHECKING, Protocol
24+
from typing import TYPE_CHECKING, Any, Protocol
2425

2526
import datafusion._internal as df_internal
27+
from datafusion._internal import EXPECTED_PROVIDER_MSG
28+
from datafusion.utils import _normalize_table_provider
2629

2730
if TYPE_CHECKING:
2831
import pyarrow as pa
2932

33+
from datafusion.context import TableProviderExportable
34+
3035
try:
3136
from warnings import deprecated # Python 3.13+
3237
except ImportError:
@@ -82,7 +87,11 @@ def database(self, name: str = "public") -> Schema:
8287
"""Returns the database with the given ``name`` from this catalog."""
8388
return self.schema(name)
8489

85-
def register_schema(self, name, schema) -> Schema | None:
90+
def register_schema(
91+
self,
92+
name: str,
93+
schema: Schema | SchemaProvider | SchemaProviderExportable,
94+
) -> Schema | None:
8695
"""Register a schema with this catalog."""
8796
if isinstance(schema, Schema):
8897
return self.catalog.register_schema(name, schema._raw_schema)
@@ -122,11 +131,16 @@ def table(self, name: str) -> Table:
122131
"""Return the table with the given ``name`` from this schema."""
123132
return Table(self._raw_schema.table(name))
124133

125-
def register_table(self, name, table) -> None:
126-
"""Register a table provider in this schema."""
127-
if isinstance(table, Table):
128-
return self._raw_schema.register_table(name, table.table)
129-
return self._raw_schema.register_table(name, table)
134+
def register_table(
135+
self, name: str, table: Table | TableProviderExportable | Any
136+
) -> None:
137+
"""Register a table or table provider in this schema.
138+
139+
Objects implementing ``__datafusion_table_provider__`` are also supported
140+
and treated as table provider instances.
141+
"""
142+
provider = _normalize_table_provider(table)
143+
return self._raw_schema.register_table(name, provider)
130144

131145
def deregister_table(self, name: str) -> None:
132146
"""Deregister a table provider from this schema."""
@@ -138,31 +152,101 @@ class Database(Schema):
138152
"""See `Schema`."""
139153

140154

155+
_InternalRawTable = df_internal.catalog.RawTable
156+
_InternalTableProvider = df_internal.TableProvider
157+
158+
# Keep in sync with ``datafusion._internal.TableProvider.from_view``.
159+
_FROM_VIEW_WARN_STACKLEVEL = 2
160+
161+
141162
class Table:
142-
"""DataFusion table."""
163+
"""DataFusion table or table provider wrapper."""
143164

144-
def __init__(self, table: df_internal.catalog.RawTable) -> None:
145-
"""This constructor is not typically called by the end user."""
146-
self.table = table
165+
__slots__ = ("_table",)
166+
167+
def __init__(
168+
self,
169+
table: _InternalRawTable | _InternalTableProvider | Table,
170+
) -> None:
171+
"""Wrap a low level table or table provider."""
172+
if isinstance(table, Table):
173+
table = table.table
174+
175+
if not isinstance(table, (_InternalRawTable, _InternalTableProvider)):
176+
raise TypeError(EXPECTED_PROVIDER_MSG)
177+
178+
self._table = table
179+
180+
def __getattribute__(self, name: str) -> Any:
181+
"""Restrict provider-specific helpers to compatible tables."""
182+
if name == "__datafusion_table_provider__":
183+
table = object.__getattribute__(self, "_table")
184+
if not hasattr(table, "__datafusion_table_provider__"):
185+
raise AttributeError(name)
186+
return object.__getattribute__(self, name)
147187

148188
def __repr__(self) -> str:
149189
"""Print a string representation of the table."""
150-
return self.table.__repr__()
190+
return repr(self._table)
151191

152-
@staticmethod
153-
def from_dataset(dataset: pa.dataset.Dataset) -> Table:
154-
"""Turn a pyarrow Dataset into a Table."""
155-
return Table(df_internal.catalog.RawTable.from_dataset(dataset))
192+
@property
193+
def table(self) -> _InternalRawTable | _InternalTableProvider:
194+
"""Return the wrapped low level table object."""
195+
return self._table
196+
197+
@classmethod
198+
def from_dataset(cls, dataset: pa.dataset.Dataset) -> Table:
199+
"""Turn a :mod:`pyarrow.dataset` ``Dataset`` into a :class:`Table`."""
200+
return cls(_InternalRawTable.from_dataset(dataset))
201+
202+
@classmethod
203+
def from_capsule(cls, capsule: Any) -> Table:
204+
"""Create a :class:`Table` from a PyCapsule exported provider."""
205+
provider = _InternalTableProvider.from_capsule(capsule)
206+
return cls(provider)
207+
208+
@classmethod
209+
def from_dataframe(cls, df: Any) -> Table:
210+
"""Create a :class:`Table` from tabular data."""
211+
from datafusion.dataframe import DataFrame as DataFrameWrapper
212+
213+
dataframe = df if isinstance(df, DataFrameWrapper) else DataFrameWrapper(df)
214+
return dataframe.into_view()
215+
216+
@classmethod
217+
def from_view(cls, df: Any) -> Table:
218+
"""Deprecated helper for constructing tables from views."""
219+
from datafusion.dataframe import DataFrame as DataFrameWrapper
220+
221+
if isinstance(df, DataFrameWrapper):
222+
df = df.df
223+
224+
provider = _InternalTableProvider.from_view(df)
225+
warnings.warn(
226+
"Table.from_view is deprecated; use DataFrame.into_view or "
227+
"Table.from_dataframe instead.",
228+
category=DeprecationWarning,
229+
stacklevel=_FROM_VIEW_WARN_STACKLEVEL,
230+
)
231+
return cls(provider)
156232

157233
@property
158234
def schema(self) -> pa.Schema:
159235
"""Returns the schema associated with this table."""
160-
return self.table.schema
236+
return self._table.schema
161237

162238
@property
163239
def kind(self) -> str:
164240
"""Returns the kind of table."""
165-
return self.table.kind
241+
return self._table.kind
242+
243+
def __datafusion_table_provider__(self) -> Any:
244+
"""Expose the wrapped provider for FFI integrations."""
245+
exporter = getattr(self._table, "__datafusion_table_provider__", None)
246+
if exporter is None:
247+
msg = "Underlying object does not export __datafusion_table_provider__()"
248+
raise AttributeError(msg)
249+
return exporter()
166250

167251

168252
class CatalogProvider(ABC):
@@ -219,14 +303,19 @@ def table(self, name: str) -> Table | None:
219303
"""Retrieve a specific table from this schema."""
220304
...
221305

222-
def register_table(self, name: str, table: Table) -> None: # noqa: B027
223-
"""Add a table from this schema.
306+
def register_table( # noqa: B027
307+
self, name: str, table: Table | TableProviderExportable | Any
308+
) -> None:
309+
"""Add a table to this schema.
224310
225311
This method is optional. If your schema provides a fixed list of tables, you do
226312
not need to implement this method.
313+
314+
Objects implementing ``__datafusion_table_provider__`` are also supported
315+
and treated as table provider instances.
227316
"""
228317

229-
def deregister_table(self, name, cascade: bool) -> None: # noqa: B027
318+
def deregister_table(self, name: str, cascade: bool) -> None: # noqa: B027
230319
"""Remove a table from this schema.
231320
232321
This method is optional. If your schema provides a fixed list of tables, you do

0 commit comments

Comments
 (0)