Add PyTableProvider and DataFrame.into_view

kosiew · kosiew · commit 54e893814c59 · 2025-02-06T16:15:57.000+08:00
diff --git a/python/datafusion/context.py b/python/datafusion/context.py
@@ -636,7 +636,7 @@ def from_pylist(
 
     def from_pydict(
         self, data: dict[str, list[Any]], name: str | None = None
-    ) -> DataFramee
+    ) -> DataFrame:
         """Create a :py:class:`~datafusion.dataframe.DataFrame` from a dictionary.
 
         Args:
diff --git a/python/tests/test_view.py b/python/tests/test_view.py
@@ -22,10 +22,10 @@ def test_register_filtered_dataframe():
 
     # Filter the DataFrame (for example, keep rows where a > 2)
     df_filtered = df.filter(col("a") > literal(2))
-    df_filtered = df_filtered.into_view()
+    view = df_filtered.into_view()
 
     # Register the filtered DataFrame as a table called "view1"
-    ctx.register_table("view1", df_filtered)
+    ctx.register_table("view1", view)
 
     # Now run a SQL query against the registered table "view1"
     df_view = ctx.sql("SELECT * FROM view1")
diff --git a/src/dataframe.rs b/src/dataframe.rs
@@ -30,10 +30,10 @@ use datafusion::arrow::util::pretty;
 use datafusion::common::UnnestOptions;
 use datafusion::config::{CsvOptions, TableParquetOptions};
 use datafusion::dataframe::{DataFrame, DataFrameWriteOptions};
+use datafusion::datasource::TableProvider;
 use datafusion::execution::SendableRecordBatchStream;
 use datafusion::parquet::basic::{BrotliLevel, Compression, GzipLevel, ZstdLevel};
 use datafusion::prelude::*;
-use datafusion::sql::sqlparser::ast::Table;
 use pyo3::exceptions::PyValueError;
 use pyo3::prelude::*;
 use pyo3::pybacked::PyBackedStr;
@@ -51,6 +51,21 @@ use crate::{
     expr::{sort_expr::PySortExpr, PyExpr},
 };
 
+#[pyclass(name = "TableProvider", module = "datafusion")]
+pub struct PyTableProvider {
+    provider: Arc<dyn TableProvider>,
+}
+
+impl PyTableProvider {
+    pub fn new(provider: Arc<dyn TableProvider>) -> Self {
+        Self { provider }
+    }
+
+    pub fn get_provider(&self) -> Arc<dyn TableProvider> {
+        self.provider.clone()
+    }
+}
+
 /// A PyDataFrame is a representation of a logical plan and an API to compose statements.
 /// Use it to build a plan and `.collect()` to execute the plan and collect the result.
 /// The actual execution of a plan runs natively on Rust and Arrow on a multi-threaded environment.
@@ -90,6 +105,15 @@ impl PyDataFrame {
         }
     }
 
+    /// Convert this DataFrame into a view (i.e. a TableProvider) that can be registered.
+    fn into_view(&self) -> PyDataFusionResult<PyTableProvider> {
+        // Call the underlying Rust DataFrame::into_view method.
+        // Note that the Rust method consumes self; here we clone the DataFrame held in
+        // the inner Arc and consume that copy, so that we don't invalidate this PyDataFrame.
+        let table_provider = self.df.as_ref().clone().into_view();
+        Ok(PyTableProvider::new(table_provider))
+    }
+
     fn __repr__(&self, py: Python) -> PyDataFusionResult<String> {
         let df = self.df.as_ref().clone().limit(0, Some(10))?;
         let batches = wait_for_future(py, df.collect())?;