Skip to content

Commit 44b9d1a

Browse files
committed
UNPICK
1 parent 46dbc85 commit 44b9d1a

File tree

6 files changed

+87
-357
lines changed

6 files changed

+87
-357
lines changed

docs/source/user-guide/dataframe/index.rst

Lines changed: 0 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -126,51 +126,6 @@ DataFusion's DataFrame API offers a wide range of operations:
126126
# Drop columns
127127
df = df.drop("temporary_column")
128128
129-
String Columns and Expressions
130-
------------------------------
131-
132-
Some ``DataFrame`` methods accept plain strings when an argument refers to an
133-
existing column. These include:
134-
135-
* :py:meth:`~datafusion.DataFrame.select`
136-
* :py:meth:`~datafusion.DataFrame.sort`
137-
* :py:meth:`~datafusion.DataFrame.drop`
138-
* :py:meth:`~datafusion.DataFrame.join` (``on`` argument)
139-
* :py:meth:`~datafusion.DataFrame.aggregate` (grouping columns)
140-
141-
For such methods, you can pass column names directly:
142-
143-
.. code-block:: python
144-
145-
from datafusion import col, functions as f
146-
147-
df.sort('id')
148-
df.aggregate('id', [f.count(col('value'))])
149-
150-
The same operation can also be written with explicit column expressions, using either ``col()`` or ``column()``:
151-
152-
.. code-block:: python
153-
154-
from datafusion import col, column, functions as f
155-
156-
df.sort(col('id'))
157-
df.aggregate(column('id'), [f.count(col('value'))])
158-
159-
Note that ``column()`` is an alias of ``col()``, so you can use either name; the example above shows both in action.
160-
161-
Whenever an argument represents an expression—such as in
162-
:py:meth:`~datafusion.DataFrame.filter` or
163-
:py:meth:`~datafusion.DataFrame.with_column`—use ``col()`` to reference columns
164-
and wrap constant values with ``lit()`` (also available as ``literal()``):
165-
166-
.. code-block:: python
167-
168-
from datafusion import col, lit
169-
df.filter(col('age') > lit(21))
170-
171-
Without ``lit()`` DataFusion would treat ``21`` as a column name rather than a
172-
constant value.
173-
174129
Terminal Operations
175130
-------------------
176131

python/datafusion/context.py

Lines changed: 21 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131

3232
from datafusion.catalog import Catalog, CatalogProvider, Table
3333
from datafusion.dataframe import DataFrame
34-
from datafusion.expr import SortKey, sort_list_to_raw_sort_list
34+
from datafusion.expr import Expr, SortExpr, sort_list_to_raw_sort_list
3535
from datafusion.record_batch import RecordBatchStream
3636
from datafusion.user_defined import AggregateUDF, ScalarUDF, TableFunction, WindowUDF
3737

@@ -553,7 +553,7 @@ def register_listing_table(
553553
table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
554554
file_extension: str = ".parquet",
555555
schema: pa.Schema | None = None,
556-
file_sort_order: list[list[SortKey]] | None = None,
556+
file_sort_order: list[list[Expr | SortExpr]] | None = None,
557557
) -> None:
558558
"""Register multiple files as a single table.
559559
@@ -567,20 +567,23 @@ def register_listing_table(
567567
table_partition_cols: Partition columns.
568568
file_extension: File extension of the provided table.
569569
schema: The data source schema.
570-
file_sort_order: Sort order for the file. Each sort key can be
571-
specified as a column name (``str``), an expression
572-
(``Expr``), or a ``SortExpr``.
570+
file_sort_order: Sort order for the file.
573571
"""
574572
if table_partition_cols is None:
575573
table_partition_cols = []
576574
table_partition_cols = self._convert_table_partition_cols(table_partition_cols)
575+
file_sort_order_raw = (
576+
[sort_list_to_raw_sort_list(f) for f in file_sort_order]
577+
if file_sort_order is not None
578+
else None
579+
)
577580
self.ctx.register_listing_table(
578581
name,
579582
str(path),
580583
table_partition_cols,
581584
file_extension,
582585
schema,
583-
self._convert_file_sort_order(file_sort_order),
586+
file_sort_order_raw,
584587
)
585588

586589
def sql(self, query: str, options: SQLOptions | None = None) -> DataFrame:
@@ -805,7 +808,7 @@ def register_parquet(
805808
file_extension: str = ".parquet",
806809
skip_metadata: bool = True,
807810
schema: pa.Schema | None = None,
808-
file_sort_order: list[list[SortKey]] | None = None,
811+
file_sort_order: list[list[SortExpr]] | None = None,
809812
) -> None:
810813
"""Register a Parquet file as a table.
811814
@@ -824,9 +827,7 @@ def register_parquet(
824827
that may be in the file schema. This can help avoid schema
825828
conflicts due to metadata.
826829
schema: The data source schema.
827-
file_sort_order: Sort order for the file. Each sort key can be
828-
specified as a column name (``str``), an expression
829-
(``Expr``), or a ``SortExpr``.
830+
file_sort_order: Sort order for the file.
830831
"""
831832
if table_partition_cols is None:
832833
table_partition_cols = []
@@ -839,7 +840,9 @@ def register_parquet(
839840
file_extension,
840841
skip_metadata,
841842
schema,
842-
self._convert_file_sort_order(file_sort_order),
843+
[sort_list_to_raw_sort_list(exprs) for exprs in file_sort_order]
844+
if file_sort_order is not None
845+
else None,
843846
)
844847

845848
def register_csv(
@@ -1096,7 +1099,7 @@ def read_parquet(
10961099
file_extension: str = ".parquet",
10971100
skip_metadata: bool = True,
10981101
schema: pa.Schema | None = None,
1099-
file_sort_order: list[list[SortKey]] | None = None,
1102+
file_sort_order: list[list[Expr | SortExpr]] | None = None,
11001103
) -> DataFrame:
11011104
"""Read a Parquet source into a :py:class:`~datafusion.dataframe.Dataframe`.
11021105
@@ -1113,17 +1116,19 @@ def read_parquet(
11131116
schema: An optional schema representing the parquet files. If None,
11141117
the parquet reader will try to infer it based on data in the
11151118
file.
1116-
file_sort_order: Sort order for the file. Each sort key can be
1117-
specified as a column name (``str``), an expression
1118-
(``Expr``), or a ``SortExpr``.
1119+
file_sort_order: Sort order for the file.
11191120
11201121
Returns:
11211122
DataFrame representation of the read Parquet files
11221123
"""
11231124
if table_partition_cols is None:
11241125
table_partition_cols = []
11251126
table_partition_cols = self._convert_table_partition_cols(table_partition_cols)
1126-
file_sort_order = self._convert_file_sort_order(file_sort_order)
1127+
file_sort_order = (
1128+
[sort_list_to_raw_sort_list(f) for f in file_sort_order]
1129+
if file_sort_order is not None
1130+
else None
1131+
)
11271132
return DataFrame(
11281133
self.ctx.read_parquet(
11291134
str(path),
@@ -1174,24 +1179,6 @@ def execute(self, plan: ExecutionPlan, partitions: int) -> RecordBatchStream:
11741179
"""Execute the ``plan`` and return the results."""
11751180
return RecordBatchStream(self.ctx.execute(plan._raw_plan, partitions))
11761181

1177-
@staticmethod
1178-
def _convert_file_sort_order(
1179-
file_sort_order: list[list[SortKey]] | None,
1180-
) -> list[list[Any]] | None:
1181-
"""Convert nested ``SortKey`` lists into raw sort representations.
1182-
1183-
Each ``SortKey`` can be a column name string, an ``Expr``, or a
1184-
``SortExpr`` and will be converted using
1185-
:func:`datafusion.expr.sort_list_to_raw_sort_list`.
1186-
"""
1187-
# Convert each ``SortKey`` in the provided sort order to the low-level
1188-
# representation expected by the Rust bindings.
1189-
return (
1190-
[sort_list_to_raw_sort_list(f) for f in file_sort_order]
1191-
if file_sort_order is not None
1192-
else None
1193-
)
1194-
11951182
@staticmethod
11961183
def _convert_table_partition_cols(
11971184
table_partition_cols: list[tuple[str, str | pa.DataType]],

python/datafusion/dataframe.py

Lines changed: 28 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -40,13 +40,7 @@
4040
from datafusion._internal import DataFrame as DataFrameInternal
4141
from datafusion._internal import ParquetColumnOptions as ParquetColumnOptionsInternal
4242
from datafusion._internal import ParquetWriterOptions as ParquetWriterOptionsInternal
43-
from datafusion.expr import (
44-
EXPR_TYPE_ERROR,
45-
Expr,
46-
SortKey,
47-
expr_list_to_raw_expr_list,
48-
sort_list_to_raw_sort_list,
49-
)
43+
from datafusion.expr import Expr, SortExpr, sort_or_default
5044
from datafusion.plan import ExecutionPlan, LogicalPlan
5145
from datafusion.record_batch import RecordBatchStream
5246

@@ -292,23 +286,6 @@ def __init__(
292286
self.bloom_filter_ndv = bloom_filter_ndv
293287

294288

295-
def _ensure_expr(value: Expr) -> expr_internal.Expr:
296-
"""Return the internal expression or raise ``TypeError`` if invalid.
297-
298-
Args:
299-
value: Candidate expression.
300-
301-
Returns:
302-
The internal expression representation.
303-
304-
Raises:
305-
TypeError: If ``value`` is not an instance of :class:`Expr`.
306-
"""
307-
if not isinstance(value, Expr):
308-
raise TypeError(EXPR_TYPE_ERROR)
309-
return value.expr
310-
311-
312289
class DataFrame:
313290
"""Two dimensional table representation of data.
314291
@@ -417,7 +394,9 @@ def select(self, *exprs: Expr | str) -> DataFrame:
417394
df = df.select("a", col("b"), col("a").alias("alternate_a"))
418395
419396
"""
420-
exprs_internal = expr_list_to_raw_expr_list(exprs)
397+
exprs_internal = [
398+
Expr.column(arg).expr if isinstance(arg, str) else arg.expr for arg in exprs
399+
]
421400
return DataFrame(self.df.select(*exprs_internal))
422401

423402
def drop(self, *columns: str) -> DataFrame:
@@ -447,7 +426,7 @@ def filter(self, *predicates: Expr) -> DataFrame:
447426
"""
448427
df = self.df
449428
for p in predicates:
450-
df = df.filter(_ensure_expr(p))
429+
df = df.filter(p.expr)
451430
return DataFrame(df)
452431

453432
def with_column(self, name: str, expr: Expr) -> DataFrame:
@@ -460,7 +439,7 @@ def with_column(self, name: str, expr: Expr) -> DataFrame:
460439
Returns:
461440
DataFrame with the new column.
462441
"""
463-
return DataFrame(self.df.with_column(name, _ensure_expr(expr)))
442+
return DataFrame(self.df.with_column(name, expr.expr))
464443

465444
def with_columns(
466445
self, *exprs: Expr | Iterable[Expr], **named_exprs: Expr
@@ -489,24 +468,17 @@ def with_columns(
489468
def _simplify_expression(
490469
*exprs: Expr | Iterable[Expr], **named_exprs: Expr
491470
) -> list[expr_internal.Expr]:
492-
expr_list: list[expr_internal.Expr] = []
471+
expr_list = []
493472
for expr in exprs:
494-
if isinstance(expr, str):
495-
raise TypeError(EXPR_TYPE_ERROR)
496-
if isinstance(expr, Iterable) and not isinstance(expr, Expr):
497-
expr_value = list(expr)
498-
if any(isinstance(inner, str) for inner in expr_value):
499-
raise TypeError(EXPR_TYPE_ERROR)
473+
if isinstance(expr, Expr):
474+
expr_list.append(expr.expr)
475+
elif isinstance(expr, Iterable):
476+
expr_list.extend(inner_expr.expr for inner_expr in expr)
500477
else:
501-
expr_value = expr
502-
try:
503-
expr_list.extend(expr_list_to_raw_expr_list(expr_value))
504-
except TypeError as err:
505-
raise TypeError(EXPR_TYPE_ERROR) from err
506-
for alias, expr in named_exprs.items():
507-
if not isinstance(expr, Expr):
508-
raise TypeError(EXPR_TYPE_ERROR)
509-
expr_list.append(expr.alias(alias).expr)
478+
raise NotImplementedError
479+
if named_exprs:
480+
for alias, expr in named_exprs.items():
481+
expr_list.append(expr.alias(alias).expr)
510482
return expr_list
511483

512484
expressions = _simplify_expression(*exprs, **named_exprs)
@@ -531,43 +503,37 @@ def with_column_renamed(self, old_name: str, new_name: str) -> DataFrame:
531503
return DataFrame(self.df.with_column_renamed(old_name, new_name))
532504

533505
def aggregate(
534-
self,
535-
group_by: list[Expr | str] | Expr | str,
536-
aggs: list[Expr] | Expr,
506+
self, group_by: list[Expr] | Expr, aggs: list[Expr] | Expr
537507
) -> DataFrame:
538508
"""Aggregates the rows of the current DataFrame.
539509
540510
Args:
541-
group_by: List of expressions or column names to group by.
511+
group_by: List of expressions to group by.
542512
aggs: List of expressions to aggregate.
543513
544514
Returns:
545515
DataFrame after aggregation.
546516
"""
547-
group_by_list = group_by if isinstance(group_by, list) else [group_by]
548-
aggs_list = aggs if isinstance(aggs, list) else [aggs]
517+
group_by = group_by if isinstance(group_by, list) else [group_by]
518+
aggs = aggs if isinstance(aggs, list) else [aggs]
549519

550-
group_by_exprs = expr_list_to_raw_expr_list(group_by_list)
551-
aggs_exprs = []
552-
for agg in aggs_list:
553-
if not isinstance(agg, Expr):
554-
raise TypeError(EXPR_TYPE_ERROR)
555-
aggs_exprs.append(agg.expr)
556-
return DataFrame(self.df.aggregate(group_by_exprs, aggs_exprs))
520+
group_by = [e.expr for e in group_by]
521+
aggs = [e.expr for e in aggs]
522+
return DataFrame(self.df.aggregate(group_by, aggs))
557523

558-
def sort(self, *exprs: SortKey) -> DataFrame:
559-
"""Sort the DataFrame by the specified sorting expressions or column names.
524+
def sort(self, *exprs: Expr | SortExpr) -> DataFrame:
525+
"""Sort the DataFrame by the specified sorting expressions.
560526
561527
Note that any expression can be turned into a sort expression by
562-
calling its ``sort`` method.
528+
calling its ``sort`` method.
563529
564530
Args:
565-
exprs: Sort expressions or column names, applied in order.
531+
exprs: Sort expressions, applied in order.
566532
567533
Returns:
568534
DataFrame after sorting.
569535
"""
570-
exprs_raw = sort_list_to_raw_sort_list(list(exprs))
536+
exprs_raw = [sort_or_default(expr) for expr in exprs]
571537
return DataFrame(self.df.sort(*exprs_raw))
572538

573539
def cast(self, mapping: dict[str, pa.DataType[Any]]) -> DataFrame:
@@ -791,7 +757,7 @@ def join_on(
791757
Returns:
792758
DataFrame after join.
793759
"""
794-
exprs = [_ensure_expr(expr) for expr in on_exprs]
760+
exprs = [expr.expr for expr in on_exprs]
795761
return DataFrame(self.df.join_on(right.df, exprs, how))
796762

797763
def explain(self, verbose: bool = False, analyze: bool = False) -> None:

0 commit comments

Comments
 (0)