From 584afd70ef0fa90610840a3fbf939cfff62bd2ab Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 23 May 2025 10:57:18 +0800 Subject: [PATCH 1/8] feat: add API reference documentation for DataFrame and index --- docs/source/api/dataframe.rst | 374 ++++++++++++++++++++++++++++++++++ docs/source/api/index.rst | 27 +++ docs/source/index.rst | 2 + 3 files changed, 403 insertions(+) create mode 100644 docs/source/api/dataframe.rst create mode 100644 docs/source/api/index.rst diff --git a/docs/source/api/dataframe.rst b/docs/source/api/dataframe.rst new file mode 100644 index 000000000..5c5394071 --- /dev/null +++ b/docs/source/api/dataframe.rst @@ -0,0 +1,374 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +================= +DataFrame API +================= + +Overview +-------- + +The ``DataFrame`` class is the core abstraction in DataFusion that represents tabular data and operations +on that data. DataFrames provide a flexible API for transforming data through various operations such as +filtering, projection, aggregation, joining, and more. + +A DataFrame represents a logical plan that is lazily evaluated. 
The actual execution occurs only when +terminal operations like ``collect()``, ``show()``, or ``to_pandas()`` are called. + +Creating DataFrames +------------------ + +DataFrames can be created in several ways: + +* From SQL queries via a ``SessionContext``: + + .. code-block:: python + + from datafusion import SessionContext + + ctx = SessionContext() + df = ctx.sql("SELECT * FROM your_table") + +* From registered tables: + + .. code-block:: python + + df = ctx.table("your_table") + +* From various data sources: + + .. code-block:: python + + # From CSV files + df = ctx.read_csv("path/to/data.csv") + + # From Parquet files + df = ctx.read_parquet("path/to/data.parquet") + + # From JSON files + df = ctx.read_json("path/to/data.json") + + # From Pandas DataFrame + import pandas as pd + pandas_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df = ctx.from_pandas(pandas_df) + + # From Arrow data + import pyarrow as pa + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], + names=["a", "b"] + ) + df = ctx.from_arrow(batch) + +Common DataFrame Operations +-------------------------- + +DataFusion's DataFrame API offers a wide range of operations: + +.. 
code-block:: python + + from datafusion import column, literal + + # Select specific columns + df = df.select("col1", "col2") + + # Select with expressions + df = df.select(column("a") + column("b"), column("a") - column("b")) + + # Filter rows + df = df.filter(column("age") > literal(25)) + + # Add computed columns + df = df.with_column("full_name", column("first_name") + literal(" ") + column("last_name")) + + # Multiple column additions + df = df.with_columns( + (column("a") + column("b")).alias("sum"), + (column("a") * column("b")).alias("product") + ) + + # Sort data + df = df.sort(column("age").sort(ascending=False)) + + # Join DataFrames + df = df1.join(df2, on="user_id", how="inner") + + # Aggregate data + from datafusion import functions as f + df = df.aggregate( + [], # Group by columns (empty for global aggregation) + [f.sum(column("amount")).alias("total_amount")] + ) + + # Limit rows + df = df.limit(100) + + # Drop columns + df = df.drop("temporary_column") + +Terminal Operations +------------------ + +To materialize the results of your DataFrame operations: + +.. code-block:: python + + # Collect all data as PyArrow RecordBatches + result_batches = df.collect() + + # Convert to various formats + pandas_df = df.to_pandas() # Pandas DataFrame + polars_df = df.to_polars() # Polars DataFrame + arrow_table = df.to_arrow_table() # PyArrow Table + py_dict = df.to_pydict() # Python dictionary + py_list = df.to_pylist() # Python list of dictionaries + + # Display results + df.show() # Print tabular format to console + + # Count rows + count = df.count() + +HTML Rendering in Jupyter +------------------------ + +When working in Jupyter notebooks or other environments that support rich HTML display, +DataFusion DataFrames automatically render as nicely formatted HTML tables. This functionality +is provided by the ``_repr_html_`` method, which is automatically called by Jupyter. 
+ +Basic HTML Rendering +~~~~~~~~~~~~~~~~~~~ + +In a Jupyter environment, simply displaying a DataFrame object will trigger HTML rendering: + +.. code-block:: python + + # Will display as HTML table in Jupyter + df + + # Explicit display also uses HTML rendering + display(df) + +HTML Rendering Customization +--------------------------- + +DataFusion provides extensive customization options for HTML table rendering through the +``datafusion.html_formatter`` module. + +Configuring the HTML Formatter +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can customize how DataFrames are rendered by configuring the formatter: + +.. code-block:: python + + from datafusion.html_formatter import configure_formatter + + configure_formatter( + max_cell_length=30, # Maximum length of cell content before truncation + max_width=800, # Maximum width of table in pixels + max_height=400, # Maximum height of table in pixels + max_memory_bytes=2 * 1024 * 1024,# Maximum memory used for rendering (2MB) + min_rows_display=10, # Minimum rows to display + repr_rows=20, # Number of rows to display in representation + enable_cell_expansion=True, # Allow cells to be expandable on click + custom_css=None, # Custom CSS to apply + show_truncation_message=True, # Show message when data is truncated + style_provider=None, # Custom style provider class + use_shared_styles=True # Share styles across tables to reduce duplication + ) + +Custom Style Providers +~~~~~~~~~~~~~~~~~~~~~ + +For advanced styling needs, you can create a custom style provider class: + +.. 
code-block:: python + + from datafusion.html_formatter import configure_formatter + + class CustomStyleProvider: + def get_cell_style(self) -> str: + return "background-color: #f5f5f5; color: #333; padding: 8px; border: 1px solid #ddd;" + + def get_header_style(self) -> str: + return "background-color: #4285f4; color: white; font-weight: bold; padding: 10px;" + + # Apply custom styling + configure_formatter(style_provider=CustomStyleProvider()) + +Custom Type Formatters +~~~~~~~~~~~~~~~~~~~~~ + +You can register custom formatters for specific data types: + +.. code-block:: python + + from datafusion.html_formatter import get_formatter + + formatter = get_formatter() + + # Format integers with color based on value + def format_int(value): + return f'<span style="color: {"red" if value > 100 else "blue"}">{value}</span>' + + formatter.register_formatter(int, format_int) + + # Format date values + def format_date(value): + return f'<span class="date-value">{value.isoformat()}</span>' + + formatter.register_formatter(datetime.date, format_date) + +Custom Cell Builders +~~~~~~~~~~~~~~~~~~~ + +For complete control over cell rendering: + +.. code-block:: python + + formatter = get_formatter() + + def custom_cell_builder(value, row, col, table_id): + try: + num_value = float(value) + if num_value > 0: # Positive values get green + return f'<td style="background-color: #d9f0d3">{value}</td>' + if num_value < 0: # Negative values get red + return f'<td style="background-color: #f0d3d3">{value}</td>' + except (ValueError, TypeError): + pass + + # Default styling for non-numeric or zero values + return f'<td style="border: 1px solid #ddd">{value}</td>' + + formatter.set_custom_cell_builder(custom_cell_builder) + +Custom Header Builders +~~~~~~~~~~~~~~~~~~~~~ + +Similarly, you can customize the rendering of table headers: + +.. code-block:: python + + def custom_header_builder(field): + tooltip = f"Type: {field.type}" + return f'<th style="background-color: #333; color: white" title="{tooltip}">{field.name}</th>' + + formatter.set_custom_header_builder(custom_header_builder) + +Managing Formatter State +----------------------- + +The HTML formatter maintains global state that can be managed: + +..
code-block:: python + + from datafusion.html_formatter import reset_formatter, reset_styles_loaded_state, get_formatter + + # Reset the formatter to default settings + reset_formatter() + + # Reset only the styles loaded state (useful when styles were loaded but need reloading) + reset_styles_loaded_state() + + # Get the current formatter instance to make changes + formatter = get_formatter() + +Advanced Example: Dashboard-Style Formatting +------------------------------------------ + +This example shows how to create a dashboard-like styling for your DataFrames: + +.. code-block:: python + + from datafusion.html_formatter import configure_formatter, get_formatter + + # Define custom CSS + custom_css = """ + .datafusion-table { + font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; + border-collapse: collapse; + width: 100%; + box-shadow: 0 2px 3px rgba(0,0,0,0.1); + } + .datafusion-table th { + position: sticky; + top: 0; + z-index: 10; + } + .datafusion-table tr:hover td { + background-color: #f1f7fa !important; + } + .datafusion-table .numeric-positive { + color: #0a7c00; + } + .datafusion-table .numeric-negative { + color: #d13438; + } + """ + + class DashboardStyleProvider: + def get_cell_style(self) -> str: + return "padding: 8px 12px; border-bottom: 1px solid #e0e0e0;" + + def get_header_style(self) -> str: + return ("background-color: #0078d4; color: white; font-weight: 600; " + "padding: 12px; text-align: left; border-bottom: 2px solid #005a9e;") + + # Apply configuration + configure_formatter( + max_height=500, + enable_cell_expansion=True, + custom_css=custom_css, + style_provider=DashboardStyleProvider(), + max_cell_length=50 + ) + + # Add custom formatters for numbers + formatter = get_formatter() + + def format_number(value): + try: + num = float(value) + cls = "numeric-positive" if num > 0 else "numeric-negative" if num < 0 else "" + return f'{value:,}' if cls else f'{value:,}' + except (ValueError, TypeError): + return str(value) + + 
formatter.register_formatter(int, format_number) + formatter.register_formatter(float, format_number) + +Best Practices +------------- + +1. **Memory Management**: For large datasets, use ``max_memory_bytes`` to limit memory usage. + +2. **Responsive Design**: Set reasonable ``max_width`` and ``max_height`` values to ensure tables display well on different screens. + +3. **Style Optimization**: Use ``use_shared_styles=True`` to avoid duplicate style definitions when displaying multiple tables. + +4. **Reset When Needed**: Call ``reset_formatter()`` when you want to start fresh with default settings. + +5. **Cell Expansion**: Use ``enable_cell_expansion=True`` when cells might contain longer content that users may want to see in full. + +Additional Resources +------------------- + +* `DataFusion User Guide <../user-guide/dataframe.html>`_ - Complete guide to using DataFrames +* `API Reference `_ - Full API reference diff --git a/docs/source/api/index.rst b/docs/source/api/index.rst new file mode 100644 index 000000000..7f58227ca --- /dev/null +++ b/docs/source/api/index.rst @@ -0,0 +1,27 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. 
+ +============= +API Reference +============= + +This section provides detailed API documentation for the DataFusion Python library. + +.. toctree:: + :maxdepth: 2 + + dataframe diff --git a/docs/source/index.rst b/docs/source/index.rst index c18793822..ff1e47280 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -93,3 +93,5 @@ Example :hidden: :maxdepth: 1 :caption: API + + api/index From f544348e252a46a3eb66da3d17be42d1a8a86375 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 23 May 2025 10:59:37 +0800 Subject: [PATCH 2/8] feat: add tests for validating RST syntax, code blocks, and internal links in DataFrame API documentation --- docs/source/api/test_dataframe.rst | 198 +++++++++++++++++++++++++++++ 1 file changed, 198 insertions(+) create mode 100644 docs/source/api/test_dataframe.rst diff --git a/docs/source/api/test_dataframe.rst b/docs/source/api/test_dataframe.rst new file mode 100644 index 000000000..90b38d647 --- /dev/null +++ b/docs/source/api/test_dataframe.rst @@ -0,0 +1,198 @@ +# docs/source/api/test_dataframe.py +"""Tests for the DataFrame API documentation in RST format. + +This script validates the RST syntax, links, and structure of the DataFrame +API documentation files. +""" + +import os +import sys +from pathlib import Path +from typing import List, Tuple + +from docutils.core import publish_doctree, publish_parts +from docutils.parsers.rst import Parser +from docutils.utils import new_document, SystemMessage + +def test_rst_syntax(file_path: str) -> List[SystemMessage]: + """Test if the RST file has valid syntax. 
+ + Args: + file_path: Path to the RST file to test + + Returns: + List of error messages, empty if no errors + """ + with open(file_path, "r", encoding="utf-8") as rst_file: + content = rst_file.read() + + parser = Parser() + settings = {} + document = new_document("test", settings) + + # Parse the document and capture any errors/warnings + parser.parse(content, document) + + return [msg for msg in document.traverse(condition=SystemMessage)] + + +def test_build_rst(file_path: str) -> Tuple[bool, List[str]]: + """Test if the RST file can be built into HTML without errors. + + Args: + file_path: Path to the RST file to test + + Returns: + Tuple containing (success status, list of error messages) + """ + with open(file_path, "r", encoding="utf-8") as rst_file: + content = rst_file.read() + + try: + # Try to build the document to HTML + publish_parts( + source=content, + writer_name="html5", + settings_overrides={"halt_level": 2} # Stop at warning level + ) + return True, [] + except Exception as e: + return False, [str(e)] + + +def test_code_blocks(file_path: str) -> List[str]: + """Test if code blocks in the RST file are properly formatted. + + Args: + file_path: Path to the RST file to test + + Returns: + List of error messages, empty if no errors + """ + with open(file_path, "r", encoding="utf-8") as rst_file: + content = rst_file.read() + + errors = [] + lines = content.split("\n") + in_code_block = False + code_block_indent = 0 + + for i, line in enumerate(lines, 1): + if ".. code-block::" in line: + in_code_block = True + code_block_indent = len(line) - len(line.lstrip()) + elif in_code_block and line.strip() and not line.startswith(" " * (code_block_indent + 4)): + # Code block content should be indented by at least 4 spaces + if not line.strip().startswith(".. 
"): # Skip RST directives + errors.append(f"Line {i}: Code block not properly indented") + in_code_block = False + elif in_code_block and not line.strip(): + # Empty line within code block, still in code block + pass + elif in_code_block: + # If line doesn't start with proper indentation, we're out of the code block + if not line.startswith(" " * (code_block_indent + 4)): + in_code_block = False + + return errors + + +def test_internal_links(file_path: str) -> List[str]: + """Test if internal links in the RST file point to valid sections. + + Args: + file_path: Path to the RST file to test + + Returns: + List of error messages, empty if no errors + """ + with open(file_path, "r", encoding="utf-8") as rst_file: + content = rst_file.read() + + errors = [] + + # Extract section titles + section_titles = [] + lines = content.split("\n") + for i, line in enumerate(lines): + if i > 0 and len(lines[i-1].strip()) > 0: + if all(c == "=" for c in line.strip()) or all(c == "-" for c in line.strip()) or all(c == "~" for c in line.strip()): + section_titles.append(lines[i-1].strip()) + + # Check if internal links point to valid sections + tree = publish_doctree(content) + for node in tree.traverse(): + if node.tagname == "reference" and "refuri" in node.attributes: + ref_uri = node.attributes["refuri"] + if ref_uri.startswith("#"): + link_target = ref_uri[1:] + # Normalize target by removing spaces and converting to lowercase + normalized_target = link_target.lower().replace(" ", "-") + # Check if target exists in section titles + found = False + for title in section_titles: + if normalized_target == title.lower().replace(" ", "-"): + found = True + break + if not found: + errors.append(f"Internal link to '#{link_target}' does not match any section title") + + return errors + + +def main(): + """Run all tests on the DataFrame RST documentation.""" + # Get the path to the RST file + current_dir = Path(os.path.dirname(os.path.abspath(__file__))) + dataframe_rst_path = current_dir 
/ "dataframe.rst" + + if not dataframe_rst_path.exists(): + print(f"Error: File not found: {dataframe_rst_path}") + return 1 + + # Run tests + print(f"Testing {dataframe_rst_path}...") + + syntax_errors = test_rst_syntax(str(dataframe_rst_path)) + if syntax_errors: + print("RST syntax errors found:") + for error in syntax_errors: + print(f" - {error}") + else: + print("✓ RST syntax is valid") + + code_block_errors = test_code_blocks(str(dataframe_rst_path)) + if code_block_errors: + print("Code block errors found:") + for error in code_block_errors: + print(f" - {error}") + else: + print("✓ Code blocks are valid") + + link_errors = test_internal_links(str(dataframe_rst_path)) + if link_errors: + print("Internal link errors found:") + for error in link_errors: + print(f" - {error}") + else: + print("✓ Internal links are valid") + + build_success, build_errors = test_build_rst(str(dataframe_rst_path)) + if not build_success: + print("Build errors found:") + for error in build_errors: + print(f" - {error}") + else: + print("✓ Document builds successfully") + + # Overall result + if syntax_errors or code_block_errors or link_errors or not build_success: + print("\n❌ Tests failed") + return 1 + else: + print("\n✅ All tests passed") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) \ No newline at end of file From 962f9ce497d0fc0367fcd0f2d3a61eba95138631 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 23 May 2025 11:05:42 +0800 Subject: [PATCH 3/8] refactor: remove test script for DataFrame API documentation in RST format --- docs/source/api/test_dataframe.rst | 198 ----------------------------- 1 file changed, 198 deletions(-) delete mode 100644 docs/source/api/test_dataframe.rst diff --git a/docs/source/api/test_dataframe.rst b/docs/source/api/test_dataframe.rst deleted file mode 100644 index 90b38d647..000000000 --- a/docs/source/api/test_dataframe.rst +++ /dev/null @@ -1,198 +0,0 @@ -# docs/source/api/test_dataframe.py -"""Tests for the DataFrame 
API documentation in RST format. - -This script validates the RST syntax, links, and structure of the DataFrame -API documentation files. -""" - -import os -import sys -from pathlib import Path -from typing import List, Tuple - -from docutils.core import publish_doctree, publish_parts -from docutils.parsers.rst import Parser -from docutils.utils import new_document, SystemMessage - -def test_rst_syntax(file_path: str) -> List[SystemMessage]: - """Test if the RST file has valid syntax. - - Args: - file_path: Path to the RST file to test - - Returns: - List of error messages, empty if no errors - """ - with open(file_path, "r", encoding="utf-8") as rst_file: - content = rst_file.read() - - parser = Parser() - settings = {} - document = new_document("test", settings) - - # Parse the document and capture any errors/warnings - parser.parse(content, document) - - return [msg for msg in document.traverse(condition=SystemMessage)] - - -def test_build_rst(file_path: str) -> Tuple[bool, List[str]]: - """Test if the RST file can be built into HTML without errors. - - Args: - file_path: Path to the RST file to test - - Returns: - Tuple containing (success status, list of error messages) - """ - with open(file_path, "r", encoding="utf-8") as rst_file: - content = rst_file.read() - - try: - # Try to build the document to HTML - publish_parts( - source=content, - writer_name="html5", - settings_overrides={"halt_level": 2} # Stop at warning level - ) - return True, [] - except Exception as e: - return False, [str(e)] - - -def test_code_blocks(file_path: str) -> List[str]: - """Test if code blocks in the RST file are properly formatted. - - Args: - file_path: Path to the RST file to test - - Returns: - List of error messages, empty if no errors - """ - with open(file_path, "r", encoding="utf-8") as rst_file: - content = rst_file.read() - - errors = [] - lines = content.split("\n") - in_code_block = False - code_block_indent = 0 - - for i, line in enumerate(lines, 1): - if ".. 
code-block::" in line: - in_code_block = True - code_block_indent = len(line) - len(line.lstrip()) - elif in_code_block and line.strip() and not line.startswith(" " * (code_block_indent + 4)): - # Code block content should be indented by at least 4 spaces - if not line.strip().startswith(".. "): # Skip RST directives - errors.append(f"Line {i}: Code block not properly indented") - in_code_block = False - elif in_code_block and not line.strip(): - # Empty line within code block, still in code block - pass - elif in_code_block: - # If line doesn't start with proper indentation, we're out of the code block - if not line.startswith(" " * (code_block_indent + 4)): - in_code_block = False - - return errors - - -def test_internal_links(file_path: str) -> List[str]: - """Test if internal links in the RST file point to valid sections. - - Args: - file_path: Path to the RST file to test - - Returns: - List of error messages, empty if no errors - """ - with open(file_path, "r", encoding="utf-8") as rst_file: - content = rst_file.read() - - errors = [] - - # Extract section titles - section_titles = [] - lines = content.split("\n") - for i, line in enumerate(lines): - if i > 0 and len(lines[i-1].strip()) > 0: - if all(c == "=" for c in line.strip()) or all(c == "-" for c in line.strip()) or all(c == "~" for c in line.strip()): - section_titles.append(lines[i-1].strip()) - - # Check if internal links point to valid sections - tree = publish_doctree(content) - for node in tree.traverse(): - if node.tagname == "reference" and "refuri" in node.attributes: - ref_uri = node.attributes["refuri"] - if ref_uri.startswith("#"): - link_target = ref_uri[1:] - # Normalize target by removing spaces and converting to lowercase - normalized_target = link_target.lower().replace(" ", "-") - # Check if target exists in section titles - found = False - for title in section_titles: - if normalized_target == title.lower().replace(" ", "-"): - found = True - break - if not found: - 
errors.append(f"Internal link to '#{link_target}' does not match any section title") - - return errors - - -def main(): - """Run all tests on the DataFrame RST documentation.""" - # Get the path to the RST file - current_dir = Path(os.path.dirname(os.path.abspath(__file__))) - dataframe_rst_path = current_dir / "dataframe.rst" - - if not dataframe_rst_path.exists(): - print(f"Error: File not found: {dataframe_rst_path}") - return 1 - - # Run tests - print(f"Testing {dataframe_rst_path}...") - - syntax_errors = test_rst_syntax(str(dataframe_rst_path)) - if syntax_errors: - print("RST syntax errors found:") - for error in syntax_errors: - print(f" - {error}") - else: - print("✓ RST syntax is valid") - - code_block_errors = test_code_blocks(str(dataframe_rst_path)) - if code_block_errors: - print("Code block errors found:") - for error in code_block_errors: - print(f" - {error}") - else: - print("✓ Code blocks are valid") - - link_errors = test_internal_links(str(dataframe_rst_path)) - if link_errors: - print("Internal link errors found:") - for error in link_errors: - print(f" - {error}") - else: - print("✓ Internal links are valid") - - build_success, build_errors = test_build_rst(str(dataframe_rst_path)) - if not build_success: - print("Build errors found:") - for error in build_errors: - print(f" - {error}") - else: - print("✓ Document builds successfully") - - # Overall result - if syntax_errors or code_block_errors or link_errors or not build_success: - print("\n❌ Tests failed") - return 1 - else: - print("\n✅ All tests passed") - return 0 - - -if __name__ == "__main__": - sys.exit(main()) \ No newline at end of file From dca035c1cfc1558060abd5d7072758668b79954b Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Mon, 2 Jun 2025 16:18:10 +0800 Subject: [PATCH 4/8] fix: correct formatting inconsistencies in dataframe.rst --- docs/source/api/dataframe.rst | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git 
a/docs/source/api/dataframe.rst b/docs/source/api/dataframe.rst index 5c5394071..83abc1a00 100644 --- a/docs/source/api/dataframe.rst +++ b/docs/source/api/dataframe.rst @@ -30,7 +30,7 @@ A DataFrame represents a logical plan that is lazily evaluated. The actual execu terminal operations like ``collect()``, ``show()``, or ``to_pandas()`` are called. Creating DataFrames ------------------- +------------------- DataFrames can be created in several ways: @@ -76,7 +76,7 @@ DataFrames can be created in several ways: df = ctx.from_arrow(batch) Common DataFrame Operations --------------------------- +--------------------------- DataFusion's DataFrame API offers a wide range of operations: @@ -122,7 +122,7 @@ DataFusion's DataFrame API offers a wide range of operations: df = df.drop("temporary_column") Terminal Operations ------------------- +------------------- To materialize the results of your DataFrame operations: @@ -145,14 +145,14 @@ To materialize the results of your DataFrame operations: count = df.count() HTML Rendering in Jupyter ------------------------- +------------------------- When working in Jupyter notebooks or other environments that support rich HTML display, DataFusion DataFrames automatically render as nicely formatted HTML tables. This functionality is provided by the ``_repr_html_`` method, which is automatically called by Jupyter. Basic HTML Rendering -~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~ In a Jupyter environment, simply displaying a DataFrame object will trigger HTML rendering: @@ -165,13 +165,13 @@ In a Jupyter environment, simply displaying a DataFrame object will trigger HTML display(df) HTML Rendering Customization ---------------------------- +---------------------------- DataFusion provides extensive customization options for HTML table rendering through the ``datafusion.html_formatter`` module. 
Configuring the HTML Formatter -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ You can customize how DataFrames are rendered by configuring the formatter: @@ -194,7 +194,7 @@ You can customize how DataFrames are rendered by configuring the formatter: ) Custom Style Providers -~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~ For advanced styling needs, you can create a custom style provider class: @@ -213,7 +213,7 @@ For advanced styling needs, you can create a custom style provider class: configure_formatter(style_provider=CustomStyleProvider()) Custom Type Formatters -~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~ You can register custom formatters for specific data types: @@ -236,7 +236,7 @@ You can register custom formatters for specific data types: formatter.register_formatter(datetime.date, format_date) Custom Cell Builders -~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~ For complete control over cell rendering: @@ -260,7 +260,7 @@ For complete control over cell rendering: formatter.set_custom_cell_builder(custom_cell_builder) Custom Header Builders -~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~ Similarly, you can customize the rendering of table headers: @@ -273,7 +273,7 @@ Similarly, you can customize the rendering of table headers: formatter.set_custom_header_builder(custom_header_builder) Managing Formatter State ------------------------ +------------------------ The HTML formatter maintains global state that can be managed: @@ -291,7 +291,7 @@ The HTML formatter maintains global state that can be managed: formatter = get_formatter() Advanced Example: Dashboard-Style Formatting ------------------------------------------- +-------------------------------------------- This example shows how to create a dashboard-like styling for your DataFrames: @@ -355,7 +355,7 @@ This example shows how to create a dashboard-like styling for your DataFrames: formatter.register_formatter(float, format_number) Best Practices -------------- +-------------- 1.
**Memory Management**: For large datasets, use ``max_memory_bytes`` to limit memory usage. @@ -368,7 +368,7 @@ Best Practices 5. **Cell Expansion**: Use ``enable_cell_expansion=True`` when cells might contain longer content that users may want to see in full. Additional Resources -------------------- +-------------------- * `DataFusion User Guide <../user-guide/dataframe.html>`_ - Complete guide to using DataFrames * `API Reference `_ - Full API reference From d7d67fb8c0f0e3dce32aa654798217a6cf516690 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Tue, 3 Jun 2025 09:21:21 +0800 Subject: [PATCH 5/8] fix: correct header formatting in functions.rst --- docs/source/user-guide/common-operations/functions.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/user-guide/common-operations/functions.rst b/docs/source/user-guide/common-operations/functions.rst index d458d3eb0..ccb47a4e7 100644 --- a/docs/source/user-guide/common-operations/functions.rst +++ b/docs/source/user-guide/common-operations/functions.rst @@ -132,7 +132,7 @@ The function :py:func:`~datafusion.functions.in_list` allows to check a column f Handling Missing Values -===================== +======================= DataFusion provides methods to handle missing values in DataFrames: From c4c8d142fedd97c9e7ea66c465d6c960df3b7359 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Tue, 3 Jun 2025 09:40:34 +0800 Subject: [PATCH 6/8] fix: adjust formatting for code block in dataframe.rst --- docs/source/user-guide/dataframe.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/user-guide/dataframe.rst b/docs/source/user-guide/dataframe.rst index 11e3d7e72..23c65b5f6 100644 --- a/docs/source/user-guide/dataframe.rst +++ b/docs/source/user-guide/dataframe.rst @@ -122,7 +122,8 @@ Performance Optimization with Shared Styles The ``use_shared_styles`` parameter (enabled by default) optimizes performance when displaying multiple DataFrames in notebook 
environments: - .. code-block:: python +.. code-block:: python + from datafusion.html_formatter import StyleProvider, configure_formatter # Default: Use shared styles (recommended for notebooks) configure_formatter(use_shared_styles=True) From 391bfe4d750f26d1b28825dce3eaec6c0ba86a14 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Tue, 3 Jun 2025 09:51:42 +0800 Subject: [PATCH 7/8] fix: skip documentation for duplicate modules in autoapi configuration --- docs/source/conf.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/source/conf.py b/docs/source/conf.py index 0ca124fd1..28db17d35 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -80,6 +80,9 @@ def autoapi_skip_member_fn(app, what, name, obj, skip, options) -> bool: # noqa ("class", "datafusion.DataFrame"), ("class", "datafusion.SessionContext"), ("module", "datafusion.common"), + # Duplicate modules (skip module-level docs to avoid duplication) + ("module", "datafusion.col"), + ("module", "datafusion.udf"), # Deprecated ("class", "datafusion.substrait.serde"), ("class", "datafusion.substrait.plan"), From ce9dd9171c056b4020de0ea568d7ddff84ea59b8 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Tue, 3 Jun 2025 10:22:15 +0800 Subject: [PATCH 8/8] fix: add cross reference to io pages --- docs/source/api/dataframe.rst | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/docs/source/api/dataframe.rst b/docs/source/api/dataframe.rst index 83abc1a00..a9e9e47c8 100644 --- a/docs/source/api/dataframe.rst +++ b/docs/source/api/dataframe.rst @@ -53,15 +53,18 @@ DataFrames can be created in several ways: .. 
code-block:: python - # From CSV files + # From CSV files (see :ref:`io_csv` for detailed options) df = ctx.read_csv("path/to/data.csv") - # From Parquet files + # From Parquet files (see :ref:`io_parquet` for detailed options) df = ctx.read_parquet("path/to/data.parquet") - # From JSON files + # From JSON files (see :ref:`io_json` for detailed options) df = ctx.read_json("path/to/data.json") + # From Avro files (see :ref:`io_avro` for detailed options) + df = ctx.read_avro("path/to/data.avro") + # From Pandas DataFrame import pandas as pd pandas_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -75,6 +78,9 @@ DataFrames can be created in several ways: ) df = ctx.from_arrow(batch) + For detailed information about reading from different data sources, see the :doc:`I/O Guide <../user-guide/io/index>`. + For custom data sources, see :ref:`io_custom_table_provider`. + Common DataFrame Operations --------------------------- @@ -370,5 +376,12 @@ Best Practices Additional Resources -------------------- -* `DataFusion User Guide <../user-guide/dataframe.html>`_ - Complete guide to using DataFrames +* :doc:`../user-guide/dataframe` - Complete guide to using DataFrames +* :doc:`../user-guide/io/index` - I/O Guide for reading data from various sources +* :doc:`../user-guide/data-sources` - Comprehensive data sources guide +* :ref:`io_csv` - CSV file reading +* :ref:`io_parquet` - Parquet file reading +* :ref:`io_json` - JSON file reading +* :ref:`io_avro` - Avro file reading +* :ref:`io_custom_table_provider` - Custom table providers * `API Reference `_ - Full API reference