From 584afd70ef0fa90610840a3fbf939cfff62bd2ab Mon Sep 17 00:00:00 2001
From: Siew Kam Onn <kosiew@gmail.com>
Date: Fri, 23 May 2025 10:57:18 +0800
Subject: [PATCH 1/8] feat: add API reference documentation for DataFrame and
 index

---
 docs/source/api/dataframe.rst | 374 ++++++++++++++++++++++++++++++++++
 docs/source/api/index.rst     |  27 +++
 docs/source/index.rst         |   2 +
 3 files changed, 403 insertions(+)
 create mode 100644 docs/source/api/dataframe.rst
 create mode 100644 docs/source/api/index.rst

diff --git a/docs/source/api/dataframe.rst b/docs/source/api/dataframe.rst
new file mode 100644
index 000000000..5c5394071
--- /dev/null
+++ b/docs/source/api/dataframe.rst
@@ -0,0 +1,374 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements.  See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership.  The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License.  You may obtain a copy of the License at
+
+..   http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied.  See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+=================
+DataFrame API
+=================
+
+Overview
+--------
+
+The ``DataFrame`` class is the core abstraction in DataFusion that represents tabular data and operations
+on that data. DataFrames provide a flexible API for transforming data through various operations such as
+filtering, projection, aggregation, joining, and more.
+
+A DataFrame represents a logical plan that is lazily evaluated. The actual execution occurs only when 
+terminal operations like ``collect()``, ``show()``, or ``to_pandas()`` are called.
+
+Creating DataFrames
+------------------
+
+DataFrames can be created in several ways:
+
+* From SQL queries via a ``SessionContext``:
+
+  .. code-block:: python
+
+      from datafusion import SessionContext
+      
+      ctx = SessionContext()
+      df = ctx.sql("SELECT * FROM your_table")
+
+* From registered tables:
+
+  .. code-block:: python
+
+      df = ctx.table("your_table")
+
+* From various data sources:
+
+  .. code-block:: python
+
+      # From CSV files
+      df = ctx.read_csv("path/to/data.csv")
+      
+      # From Parquet files
+      df = ctx.read_parquet("path/to/data.parquet")
+      
+      # From JSON files
+      df = ctx.read_json("path/to/data.json")
+      
+      # From Pandas DataFrame
+      import pandas as pd
+      pandas_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+      df = ctx.from_pandas(pandas_df)
+      
+      # From Arrow data
+      import pyarrow as pa
+      batch = pa.RecordBatch.from_arrays(
+          [pa.array([1, 2, 3]), pa.array([4, 5, 6])],
+          names=["a", "b"]
+      )
+      df = ctx.from_arrow(batch)
+
+Common DataFrame Operations
+--------------------------
+
+DataFusion's DataFrame API offers a wide range of operations:
+
+.. code-block:: python
+
+    from datafusion import column, literal
+    
+    # Select specific columns
+    df = df.select("col1", "col2")
+    
+    # Select with expressions
+    df = df.select(column("a") + column("b"), column("a") - column("b"))
+    
+    # Filter rows
+    df = df.filter(column("age") > literal(25))
+    
+    # Add computed columns
+    df = df.with_column("full_name", column("first_name") + literal(" ") + column("last_name"))
+    
+    # Multiple column additions
+    df = df.with_columns(
+        (column("a") + column("b")).alias("sum"),
+        (column("a") * column("b")).alias("product")
+    )
+    
+    # Sort data
+    df = df.sort(column("age").sort(ascending=False))
+    
+    # Join DataFrames
+    df = df1.join(df2, on="user_id", how="inner")
+    
+    # Aggregate data
+    from datafusion import functions as f
+    df = df.aggregate(
+        [],  # Group by columns (empty for global aggregation)
+        [f.sum(column("amount")).alias("total_amount")]
+    )
+    
+    # Limit rows
+    df = df.limit(100)
+    
+    # Drop columns
+    df = df.drop("temporary_column")
+
+Terminal Operations
+------------------
+
+To materialize the results of your DataFrame operations:
+
+.. code-block:: python
+
+    # Collect all data as PyArrow RecordBatches
+    result_batches = df.collect()
+    
+    # Convert to various formats
+    pandas_df = df.to_pandas()        # Pandas DataFrame
+    polars_df = df.to_polars()        # Polars DataFrame
+    arrow_table = df.to_arrow_table() # PyArrow Table
+    py_dict = df.to_pydict()          # Python dictionary
+    py_list = df.to_pylist()          # Python list of dictionaries
+    
+    # Display results
+    df.show()                         # Print tabular format to console
+    
+    # Count rows
+    count = df.count()
+
+HTML Rendering in Jupyter
+------------------------
+
+When working in Jupyter notebooks or other environments that support rich HTML display, 
+DataFusion DataFrames automatically render as nicely formatted HTML tables. This functionality
+is provided by the ``_repr_html_`` method, which is automatically called by Jupyter.
+
+Basic HTML Rendering
+~~~~~~~~~~~~~~~~~~~
+
+In a Jupyter environment, simply displaying a DataFrame object will trigger HTML rendering:
+
+.. code-block:: python
+
+    # Will display as HTML table in Jupyter
+    df
+
+    # Explicit display also uses HTML rendering
+    display(df)
+
+HTML Rendering Customization
+---------------------------
+
+DataFusion provides extensive customization options for HTML table rendering through the
+``datafusion.html_formatter`` module.
+
+Configuring the HTML Formatter
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You can customize how DataFrames are rendered by configuring the formatter:
+
+.. code-block:: python
+
+    from datafusion.html_formatter import configure_formatter
+    
+    configure_formatter(
+        max_cell_length=30,              # Maximum length of cell content before truncation
+        max_width=800,                   # Maximum width of table in pixels
+        max_height=400,                  # Maximum height of table in pixels
+        max_memory_bytes=2 * 1024 * 1024,# Maximum memory used for rendering (2MB)
+        min_rows_display=10,             # Minimum rows to display
+        repr_rows=20,                    # Number of rows to display in representation
+        enable_cell_expansion=True,      # Allow cells to be expandable on click
+        custom_css=None,                 # Custom CSS to apply
+        show_truncation_message=True,    # Show message when data is truncated
+        style_provider=None,             # Custom style provider class
+        use_shared_styles=True           # Share styles across tables to reduce duplication
+    )
+
+Custom Style Providers
+~~~~~~~~~~~~~~~~~~~~~
+
+For advanced styling needs, you can create a custom style provider class:
+
+.. code-block:: python
+
+    from datafusion.html_formatter import configure_formatter
+    
+    class CustomStyleProvider:
+        def get_cell_style(self) -> str:
+            return "background-color: #f5f5f5; color: #333; padding: 8px; border: 1px solid #ddd;"
+    
+        def get_header_style(self) -> str:
+            return "background-color: #4285f4; color: white; font-weight: bold; padding: 10px;"
+    
+    # Apply custom styling
+    configure_formatter(style_provider=CustomStyleProvider())
+
+Custom Type Formatters
+~~~~~~~~~~~~~~~~~~~~~
+
+You can register custom formatters for specific data types:
+
+.. code-block:: python
+
+    import datetime
+    from datafusion.html_formatter import get_formatter
+    formatter = get_formatter()
+    
+    # Format integers with color based on value
+    def format_int(value):
+        return f'<span style="color: {"red" if value > 100 else "blue"}">{value}</span>'
+    
+    formatter.register_formatter(int, format_int)
+    
+    # Format date values
+    def format_date(value):
+        return f'<span class="date-value">{value.isoformat()}</span>'
+    
+    formatter.register_formatter(datetime.date, format_date)
+
+Custom Cell Builders
+~~~~~~~~~~~~~~~~~~~
+
+For complete control over cell rendering:
+
+.. code-block:: python
+
+    formatter = get_formatter()
+    
+    def custom_cell_builder(value, row, col, table_id):
+        try:
+            num_value = float(value)
+            if num_value > 0:  # Positive values get green
+                return f'<td style="background-color: #d9f0d3">{value}</td>'
+            if num_value < 0:  # Negative values get red
+                return f'<td style="background-color: #f0d3d3">{value}</td>'
+        except (ValueError, TypeError):
+            pass
+        
+        # Default styling for non-numeric or zero values
+        return f'<td style="border: 1px solid #ddd">{value}</td>'
+    
+    formatter.set_custom_cell_builder(custom_cell_builder)
+
+Custom Header Builders
+~~~~~~~~~~~~~~~~~~~~~
+
+Similarly, you can customize the rendering of table headers:
+
+.. code-block:: python
+
+    def custom_header_builder(field):
+        tooltip = f"Type: {field.type}"
+        return f'<th style="background-color: #333; color: white" title="{tooltip}">{field.name}</th>'
+    
+    formatter.set_custom_header_builder(custom_header_builder)
+
+Managing Formatter State
+-----------------------
+
+The HTML formatter maintains global state that can be managed:
+
+.. code-block:: python
+
+    from datafusion.html_formatter import reset_formatter, reset_styles_loaded_state, get_formatter
+    
+    # Reset the formatter to default settings
+    reset_formatter()
+    
+    # Reset only the styles loaded state (useful when styles were loaded but need reloading)
+    reset_styles_loaded_state()
+    
+    # Get the current formatter instance to make changes
+    formatter = get_formatter()
+
+Advanced Example: Dashboard-Style Formatting
+------------------------------------------
+
+This example shows how to create a dashboard-like styling for your DataFrames:
+
+.. code-block:: python
+
+    from datafusion.html_formatter import configure_formatter, get_formatter
+    
+    # Define custom CSS
+    custom_css = """
+    .datafusion-table {
+        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
+        border-collapse: collapse;
+        width: 100%;
+        box-shadow: 0 2px 3px rgba(0,0,0,0.1);
+    }
+    .datafusion-table th {
+        position: sticky;
+        top: 0;
+        z-index: 10;
+    }
+    .datafusion-table tr:hover td {
+        background-color: #f1f7fa !important;
+    }
+    .datafusion-table .numeric-positive {
+        color: #0a7c00;
+    }
+    .datafusion-table .numeric-negative {
+        color: #d13438;
+    }
+    """
+    
+    class DashboardStyleProvider:
+        def get_cell_style(self) -> str:
+            return "padding: 8px 12px; border-bottom: 1px solid #e0e0e0;"
+        
+        def get_header_style(self) -> str:
+            return ("background-color: #0078d4; color: white; font-weight: 600; "
+                    "padding: 12px; text-align: left; border-bottom: 2px solid #005a9e;")
+    
+    # Apply configuration
+    configure_formatter(
+        max_height=500,
+        enable_cell_expansion=True,
+        custom_css=custom_css,
+        style_provider=DashboardStyleProvider(),
+        max_cell_length=50
+    )
+    
+    # Add custom formatters for numbers
+    formatter = get_formatter()
+    
+    def format_number(value):
+        try:
+            num = float(value)
+            cls = "numeric-positive" if num > 0 else "numeric-negative" if num < 0 else ""
+            return f'<span class="{cls}">{value:,}</span>' if cls else f'{value:,}'
+        except (ValueError, TypeError):
+            return str(value)
+    
+    formatter.register_formatter(int, format_number)
+    formatter.register_formatter(float, format_number)
+
+Best Practices
+-------------
+
+1. **Memory Management**: For large datasets, use ``max_memory_bytes`` to limit memory usage.
+
+2. **Responsive Design**: Set reasonable ``max_width`` and ``max_height`` values to ensure tables display well on different screens.
+
+3. **Style Optimization**: Use ``use_shared_styles=True`` to avoid duplicate style definitions when displaying multiple tables.
+
+4. **Reset When Needed**: Call ``reset_formatter()`` when you want to start fresh with default settings.
+
+5. **Cell Expansion**: Use ``enable_cell_expansion=True`` when cells might contain longer content that users may want to see in full.
+
+Additional Resources
+-------------------
+
+* `DataFusion User Guide <../user-guide/dataframe.html>`_ - Complete guide to using DataFrames
+* `API Reference <https://arrow.apache.org/datafusion-python/api/index.html>`_ - Full API reference
diff --git a/docs/source/api/index.rst b/docs/source/api/index.rst
new file mode 100644
index 000000000..7f58227ca
--- /dev/null
+++ b/docs/source/api/index.rst
@@ -0,0 +1,27 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements.  See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership.  The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License.  You may obtain a copy of the License at
+
+..   http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied.  See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+=============
+API Reference
+=============
+
+This section provides detailed API documentation for the DataFusion Python library.
+
+.. toctree::
+   :maxdepth: 2
+   
+   dataframe
diff --git a/docs/source/index.rst b/docs/source/index.rst
index c18793822..ff1e47280 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -93,3 +93,5 @@ Example
    :hidden:
    :maxdepth: 1
    :caption: API
+   
+   api/index

From f544348e252a46a3eb66da3d17be42d1a8a86375 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn <kosiew@gmail.com>
Date: Fri, 23 May 2025 10:59:37 +0800
Subject: [PATCH 2/8] feat: add tests for validating RST syntax, code blocks,
 and internal links in DataFrame API documentation

---
 docs/source/api/test_dataframe.rst | 198 +++++++++++++++++++++++++++++
 1 file changed, 198 insertions(+)
 create mode 100644 docs/source/api/test_dataframe.rst

diff --git a/docs/source/api/test_dataframe.rst b/docs/source/api/test_dataframe.rst
new file mode 100644
index 000000000..90b38d647
--- /dev/null
+++ b/docs/source/api/test_dataframe.rst
@@ -0,0 +1,198 @@
+# docs/source/api/test_dataframe.py
+"""Tests for the DataFrame API documentation in RST format.
+
+This script validates the RST syntax, links, and structure of the DataFrame
+API documentation files.
+"""
+
+import os
+import sys
+from pathlib import Path
+from typing import List, Tuple
+
+from docutils.core import publish_doctree, publish_parts
+from docutils.parsers.rst import Parser
+from docutils.utils import new_document, SystemMessage
+
+def test_rst_syntax(file_path: str) -> List[SystemMessage]:
+    """Test if the RST file has valid syntax.
+    
+    Args:
+        file_path: Path to the RST file to test
+        
+    Returns:
+        List of error messages, empty if no errors
+    """
+    with open(file_path, "r", encoding="utf-8") as rst_file:
+        content = rst_file.read()
+    
+    parser = Parser()
+    settings = {}
+    document = new_document("test", settings)
+    
+    # Parse the document and capture any errors/warnings
+    parser.parse(content, document)
+    
+    return [msg for msg in document.traverse(condition=SystemMessage)]
+
+
+def test_build_rst(file_path: str) -> Tuple[bool, List[str]]:
+    """Test if the RST file can be built into HTML without errors.
+    
+    Args:
+        file_path: Path to the RST file to test
+        
+    Returns:
+        Tuple containing (success status, list of error messages)
+    """
+    with open(file_path, "r", encoding="utf-8") as rst_file:
+        content = rst_file.read()
+    
+    try:
+        # Try to build the document to HTML
+        publish_parts(
+            source=content,
+            writer_name="html5",
+            settings_overrides={"halt_level": 2}  # Stop at warning level
+        )
+        return True, []
+    except Exception as e:
+        return False, [str(e)]
+
+
+def test_code_blocks(file_path: str) -> List[str]:
+    """Test if code blocks in the RST file are properly formatted.
+    
+    Args:
+        file_path: Path to the RST file to test
+        
+    Returns:
+        List of error messages, empty if no errors
+    """
+    with open(file_path, "r", encoding="utf-8") as rst_file:
+        content = rst_file.read()
+    
+    errors = []
+    lines = content.split("\n")
+    in_code_block = False
+    code_block_indent = 0
+    
+    for i, line in enumerate(lines, 1):
+        if ".. code-block::" in line:
+            in_code_block = True
+            code_block_indent = len(line) - len(line.lstrip())
+        elif in_code_block and line.strip() and not line.startswith(" " * (code_block_indent + 4)):
+            # Code block content should be indented by at least 4 spaces
+            if not line.strip().startswith(".. "):  # Skip RST directives
+                errors.append(f"Line {i}: Code block not properly indented")
+                in_code_block = False
+        elif in_code_block and not line.strip():
+            # Empty line within code block, still in code block
+            pass
+        elif in_code_block:
+            # If line doesn't start with proper indentation, we're out of the code block
+            if not line.startswith(" " * (code_block_indent + 4)):
+                in_code_block = False
+    
+    return errors
+
+
+def test_internal_links(file_path: str) -> List[str]:
+    """Test if internal links in the RST file point to valid sections.
+    
+    Args:
+        file_path: Path to the RST file to test
+        
+    Returns:
+        List of error messages, empty if no errors
+    """
+    with open(file_path, "r", encoding="utf-8") as rst_file:
+        content = rst_file.read()
+    
+    errors = []
+    
+    # Extract section titles
+    section_titles = []
+    lines = content.split("\n")
+    for i, line in enumerate(lines):
+        if i > 0 and len(lines[i-1].strip()) > 0:
+            if all(c == "=" for c in line.strip()) or all(c == "-" for c in line.strip()) or all(c == "~" for c in line.strip()):
+                section_titles.append(lines[i-1].strip())
+    
+    # Check if internal links point to valid sections
+    tree = publish_doctree(content)
+    for node in tree.traverse():
+        if node.tagname == "reference" and "refuri" in node.attributes:
+            ref_uri = node.attributes["refuri"]
+            if ref_uri.startswith("#"):
+                link_target = ref_uri[1:]
+                # Normalize target by removing spaces and converting to lowercase
+                normalized_target = link_target.lower().replace(" ", "-")
+                # Check if target exists in section titles
+                found = False
+                for title in section_titles:
+                    if normalized_target == title.lower().replace(" ", "-"):
+                        found = True
+                        break
+                if not found:
+                    errors.append(f"Internal link to '#{link_target}' does not match any section title")
+    
+    return errors
+
+
+def main():
+    """Run all tests on the DataFrame RST documentation."""
+    # Get the path to the RST file
+    current_dir = Path(os.path.dirname(os.path.abspath(__file__)))
+    dataframe_rst_path = current_dir / "dataframe.rst"
+    
+    if not dataframe_rst_path.exists():
+        print(f"Error: File not found: {dataframe_rst_path}")
+        return 1
+    
+    # Run tests
+    print(f"Testing {dataframe_rst_path}...")
+    
+    syntax_errors = test_rst_syntax(str(dataframe_rst_path))
+    if syntax_errors:
+        print("RST syntax errors found:")
+        for error in syntax_errors:
+            print(f"  - {error}")
+    else:
+        print("✓ RST syntax is valid")
+    
+    code_block_errors = test_code_blocks(str(dataframe_rst_path))
+    if code_block_errors:
+        print("Code block errors found:")
+        for error in code_block_errors:
+            print(f"  - {error}")
+    else:
+        print("✓ Code blocks are valid")
+    
+    link_errors = test_internal_links(str(dataframe_rst_path))
+    if link_errors:
+        print("Internal link errors found:")
+        for error in link_errors:
+            print(f"  - {error}")
+    else:
+        print("✓ Internal links are valid")
+    
+    build_success, build_errors = test_build_rst(str(dataframe_rst_path))
+    if not build_success:
+        print("Build errors found:")
+        for error in build_errors:
+            print(f"  - {error}")
+    else:
+        print("✓ Document builds successfully")
+    
+    # Overall result
+    if syntax_errors or code_block_errors or link_errors or not build_success:
+        print("\n❌ Tests failed")
+        return 1
+    else:
+        print("\n✅ All tests passed")
+        return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
\ No newline at end of file

From 962f9ce497d0fc0367fcd0f2d3a61eba95138631 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn <kosiew@gmail.com>
Date: Fri, 23 May 2025 11:05:42 +0800
Subject: [PATCH 3/8] refactor: remove test script for DataFrame API
 documentation in RST format

---
 docs/source/api/test_dataframe.rst | 198 -----------------------------
 1 file changed, 198 deletions(-)
 delete mode 100644 docs/source/api/test_dataframe.rst

diff --git a/docs/source/api/test_dataframe.rst b/docs/source/api/test_dataframe.rst
deleted file mode 100644
index 90b38d647..000000000
--- a/docs/source/api/test_dataframe.rst
+++ /dev/null
@@ -1,198 +0,0 @@
-# docs/source/api/test_dataframe.py
-"""Tests for the DataFrame API documentation in RST format.
-
-This script validates the RST syntax, links, and structure of the DataFrame
-API documentation files.
-"""
-
-import os
-import sys
-from pathlib import Path
-from typing import List, Tuple
-
-from docutils.core import publish_doctree, publish_parts
-from docutils.parsers.rst import Parser
-from docutils.utils import new_document, SystemMessage
-
-def test_rst_syntax(file_path: str) -> List[SystemMessage]:
-    """Test if the RST file has valid syntax.
-    
-    Args:
-        file_path: Path to the RST file to test
-        
-    Returns:
-        List of error messages, empty if no errors
-    """
-    with open(file_path, "r", encoding="utf-8") as rst_file:
-        content = rst_file.read()
-    
-    parser = Parser()
-    settings = {}
-    document = new_document("test", settings)
-    
-    # Parse the document and capture any errors/warnings
-    parser.parse(content, document)
-    
-    return [msg for msg in document.traverse(condition=SystemMessage)]
-
-
-def test_build_rst(file_path: str) -> Tuple[bool, List[str]]:
-    """Test if the RST file can be built into HTML without errors.
-    
-    Args:
-        file_path: Path to the RST file to test
-        
-    Returns:
-        Tuple containing (success status, list of error messages)
-    """
-    with open(file_path, "r", encoding="utf-8") as rst_file:
-        content = rst_file.read()
-    
-    try:
-        # Try to build the document to HTML
-        publish_parts(
-            source=content,
-            writer_name="html5",
-            settings_overrides={"halt_level": 2}  # Stop at warning level
-        )
-        return True, []
-    except Exception as e:
-        return False, [str(e)]
-
-
-def test_code_blocks(file_path: str) -> List[str]:
-    """Test if code blocks in the RST file are properly formatted.
-    
-    Args:
-        file_path: Path to the RST file to test
-        
-    Returns:
-        List of error messages, empty if no errors
-    """
-    with open(file_path, "r", encoding="utf-8") as rst_file:
-        content = rst_file.read()
-    
-    errors = []
-    lines = content.split("\n")
-    in_code_block = False
-    code_block_indent = 0
-    
-    for i, line in enumerate(lines, 1):
-        if ".. code-block::" in line:
-            in_code_block = True
-            code_block_indent = len(line) - len(line.lstrip())
-        elif in_code_block and line.strip() and not line.startswith(" " * (code_block_indent + 4)):
-            # Code block content should be indented by at least 4 spaces
-            if not line.strip().startswith(".. "):  # Skip RST directives
-                errors.append(f"Line {i}: Code block not properly indented")
-                in_code_block = False
-        elif in_code_block and not line.strip():
-            # Empty line within code block, still in code block
-            pass
-        elif in_code_block:
-            # If line doesn't start with proper indentation, we're out of the code block
-            if not line.startswith(" " * (code_block_indent + 4)):
-                in_code_block = False
-    
-    return errors
-
-
-def test_internal_links(file_path: str) -> List[str]:
-    """Test if internal links in the RST file point to valid sections.
-    
-    Args:
-        file_path: Path to the RST file to test
-        
-    Returns:
-        List of error messages, empty if no errors
-    """
-    with open(file_path, "r", encoding="utf-8") as rst_file:
-        content = rst_file.read()
-    
-    errors = []
-    
-    # Extract section titles
-    section_titles = []
-    lines = content.split("\n")
-    for i, line in enumerate(lines):
-        if i > 0 and len(lines[i-1].strip()) > 0:
-            if all(c == "=" for c in line.strip()) or all(c == "-" for c in line.strip()) or all(c == "~" for c in line.strip()):
-                section_titles.append(lines[i-1].strip())
-    
-    # Check if internal links point to valid sections
-    tree = publish_doctree(content)
-    for node in tree.traverse():
-        if node.tagname == "reference" and "refuri" in node.attributes:
-            ref_uri = node.attributes["refuri"]
-            if ref_uri.startswith("#"):
-                link_target = ref_uri[1:]
-                # Normalize target by removing spaces and converting to lowercase
-                normalized_target = link_target.lower().replace(" ", "-")
-                # Check if target exists in section titles
-                found = False
-                for title in section_titles:
-                    if normalized_target == title.lower().replace(" ", "-"):
-                        found = True
-                        break
-                if not found:
-                    errors.append(f"Internal link to '#{link_target}' does not match any section title")
-    
-    return errors
-
-
-def main():
-    """Run all tests on the DataFrame RST documentation."""
-    # Get the path to the RST file
-    current_dir = Path(os.path.dirname(os.path.abspath(__file__)))
-    dataframe_rst_path = current_dir / "dataframe.rst"
-    
-    if not dataframe_rst_path.exists():
-        print(f"Error: File not found: {dataframe_rst_path}")
-        return 1
-    
-    # Run tests
-    print(f"Testing {dataframe_rst_path}...")
-    
-    syntax_errors = test_rst_syntax(str(dataframe_rst_path))
-    if syntax_errors:
-        print("RST syntax errors found:")
-        for error in syntax_errors:
-            print(f"  - {error}")
-    else:
-        print("✓ RST syntax is valid")
-    
-    code_block_errors = test_code_blocks(str(dataframe_rst_path))
-    if code_block_errors:
-        print("Code block errors found:")
-        for error in code_block_errors:
-            print(f"  - {error}")
-    else:
-        print("✓ Code blocks are valid")
-    
-    link_errors = test_internal_links(str(dataframe_rst_path))
-    if link_errors:
-        print("Internal link errors found:")
-        for error in link_errors:
-            print(f"  - {error}")
-    else:
-        print("✓ Internal links are valid")
-    
-    build_success, build_errors = test_build_rst(str(dataframe_rst_path))
-    if not build_success:
-        print("Build errors found:")
-        for error in build_errors:
-            print(f"  - {error}")
-    else:
-        print("✓ Document builds successfully")
-    
-    # Overall result
-    if syntax_errors or code_block_errors or link_errors or not build_success:
-        print("\n❌ Tests failed")
-        return 1
-    else:
-        print("\n✅ All tests passed")
-        return 0
-
-
-if __name__ == "__main__":
-    sys.exit(main())
\ No newline at end of file

From dca035c1cfc1558060abd5d7072758668b79954b Mon Sep 17 00:00:00 2001
From: Siew Kam Onn <kosiew@gmail.com>
Date: Mon, 2 Jun 2025 16:18:10 +0800
Subject: [PATCH 4/8] fix: correct formatting inconsistencies in dataframe.rst

---
 docs/source/api/dataframe.rst | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/docs/source/api/dataframe.rst b/docs/source/api/dataframe.rst
index 5c5394071..83abc1a00 100644
--- a/docs/source/api/dataframe.rst
+++ b/docs/source/api/dataframe.rst
@@ -30,7 +30,7 @@ A DataFrame represents a logical plan that is lazily evaluated. The actual execu
 terminal operations like ``collect()``, ``show()``, or ``to_pandas()`` are called.
 
 Creating DataFrames
-------------------
+-------------------
 
 DataFrames can be created in several ways:
 
@@ -76,7 +76,7 @@ DataFrames can be created in several ways:
       df = ctx.from_arrow(batch)
 
 Common DataFrame Operations
---------------------------
+---------------------------
 
 DataFusion's DataFrame API offers a wide range of operations:
 
@@ -122,7 +122,7 @@ DataFusion's DataFrame API offers a wide range of operations:
     df = df.drop("temporary_column")
 
 Terminal Operations
-------------------
+-------------------
 
 To materialize the results of your DataFrame operations:
 
@@ -145,14 +145,14 @@ To materialize the results of your DataFrame operations:
     count = df.count()
 
 HTML Rendering in Jupyter
-------------------------
+-------------------------
 
 When working in Jupyter notebooks or other environments that support rich HTML display, 
 DataFusion DataFrames automatically render as nicely formatted HTML tables. This functionality
 is provided by the ``_repr_html_`` method, which is automatically called by Jupyter.
 
 Basic HTML Rendering
-~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~
 
 In a Jupyter environment, simply displaying a DataFrame object will trigger HTML rendering:
 
@@ -165,13 +165,13 @@ In a Jupyter environment, simply displaying a DataFrame object will trigger HTML
     display(df)
 
 HTML Rendering Customization
----------------------------
+----------------------------
 
 DataFusion provides extensive customization options for HTML table rendering through the
 ``datafusion.html_formatter`` module.
 
 Configuring the HTML Formatter
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 You can customize how DataFrames are rendered by configuring the formatter:
 
@@ -194,7 +194,7 @@ You can customize how DataFrames are rendered by configuring the formatter:
     )
 
 Custom Style Providers
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~
 
 For advanced styling needs, you can create a custom style provider class:
 
@@ -213,7 +213,7 @@ For advanced styling needs, you can create a custom style provider class:
     configure_formatter(style_provider=CustomStyleProvider())
 
 Custom Type Formatters
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~
 
 You can register custom formatters for specific data types:
 
@@ -236,7 +236,7 @@ You can register custom formatters for specific data types:
     formatter.register_formatter(datetime.date, format_date)
 
 Custom Cell Builders
-~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~
 
 For complete control over cell rendering:
 
@@ -260,7 +260,7 @@ For complete control over cell rendering:
     formatter.set_custom_cell_builder(custom_cell_builder)
 
 Custom Header Builders
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~
 
 Similarly, you can customize the rendering of table headers:
 
@@ -273,7 +273,7 @@ Similarly, you can customize the rendering of table headers:
     formatter.set_custom_header_builder(custom_header_builder)
 
 Managing Formatter State
------------------------
+------------------------
 
 The HTML formatter maintains global state that can be managed:
 
@@ -291,7 +291,7 @@ The HTML formatter maintains global state that can be managed:
     formatter = get_formatter()
 
 Advanced Example: Dashboard-Style Formatting
-------------------------------------------
+--------------------------------------------
 
 This example shows how to create a dashboard-like styling for your DataFrames:
 
@@ -355,7 +355,7 @@ This example shows how to create a dashboard-like styling for your DataFrames:
     formatter.register_formatter(float, format_number)
 
 Best Practices
--------------
+--------------
 
 1. **Memory Management**: For large datasets, use ``max_memory_bytes`` to limit memory usage.
 
@@ -368,7 +368,7 @@ Best Practices
 5. **Cell Expansion**: Use ``enable_cell_expansion=True`` when cells might contain longer content that users may want to see in full.
 
 Additional Resources
--------------------
+--------------------
 
 * `DataFusion User Guide <../user-guide/dataframe.html>`_ - Complete guide to using DataFrames
 * `API Reference <https://arrow.apache.org/datafusion-python/api/index.html>`_ - Full API reference

From d7d67fb8c0f0e3dce32aa654798217a6cf516690 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn <kosiew@gmail.com>
Date: Tue, 3 Jun 2025 09:21:21 +0800
Subject: [PATCH 5/8] fix: correct header formatting in functions.rst

---
 docs/source/user-guide/common-operations/functions.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/user-guide/common-operations/functions.rst b/docs/source/user-guide/common-operations/functions.rst
index d458d3eb0..ccb47a4e7 100644
--- a/docs/source/user-guide/common-operations/functions.rst
+++ b/docs/source/user-guide/common-operations/functions.rst
@@ -132,7 +132,7 @@ The function :py:func:`~datafusion.functions.in_list` allows to check a column f
 
 
 Handling Missing Values 
-=====================
+=======================
 
 DataFusion provides methods to handle missing values in DataFrames:
 

From c4c8d142fedd97c9e7ea66c465d6c960df3b7359 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn <kosiew@gmail.com>
Date: Tue, 3 Jun 2025 09:40:34 +0800
Subject: [PATCH 6/8] fix: adjust formatting for code block in dataframe.rst

---
 docs/source/user-guide/dataframe.rst | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/source/user-guide/dataframe.rst b/docs/source/user-guide/dataframe.rst
index 11e3d7e72..23c65b5f6 100644
--- a/docs/source/user-guide/dataframe.rst
+++ b/docs/source/user-guide/dataframe.rst
@@ -122,7 +122,8 @@ Performance Optimization with Shared Styles
 The ``use_shared_styles`` parameter (enabled by default) optimizes performance when displaying 
 multiple DataFrames in notebook environments:
 
- .. code-block:: python
+.. code-block:: python
+
     from datafusion.html_formatter import StyleProvider, configure_formatter
     # Default: Use shared styles (recommended for notebooks)
     configure_formatter(use_shared_styles=True)

From 391bfe4d750f26d1b28825dce3eaec6c0ba86a14 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn <kosiew@gmail.com>
Date: Tue, 3 Jun 2025 09:51:42 +0800
Subject: [PATCH 7/8] fix: skip documentation for duplicate modules in autoapi
 configuration

---
 docs/source/conf.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 0ca124fd1..28db17d35 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -80,6 +80,9 @@ def autoapi_skip_member_fn(app, what, name, obj, skip, options) -> bool:  # noqa
         ("class", "datafusion.DataFrame"),
         ("class", "datafusion.SessionContext"),
         ("module", "datafusion.common"),
+        # Duplicate modules (skip module-level docs to avoid duplication)
+        ("module", "datafusion.col"),
+        ("module", "datafusion.udf"),
         # Deprecated
         ("class", "datafusion.substrait.serde"),
         ("class", "datafusion.substrait.plan"),

From ce9dd9171c056b4020de0ea568d7ddff84ea59b8 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn <kosiew@gmail.com>
Date: Tue, 3 Jun 2025 10:22:15 +0800
Subject: [PATCH 8/8] fix: add cross reference to io pages

---
 docs/source/api/dataframe.rst | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/docs/source/api/dataframe.rst b/docs/source/api/dataframe.rst
index 83abc1a00..a9e9e47c8 100644
--- a/docs/source/api/dataframe.rst
+++ b/docs/source/api/dataframe.rst
@@ -53,15 +53,18 @@ DataFrames can be created in several ways:
 
   .. code-block:: python
 
-      # From CSV files
+      # From CSV files (see the CSV page of the I/O guide for detailed options)
       df = ctx.read_csv("path/to/data.csv")
       
-      # From Parquet files
+      # From Parquet files (see the Parquet page of the I/O guide for detailed options)
       df = ctx.read_parquet("path/to/data.parquet")
       
-      # From JSON files
+      # From JSON files (see the JSON page of the I/O guide for detailed options)
       df = ctx.read_json("path/to/data.json")
       
+      # From Avro files (see the Avro page of the I/O guide for detailed options)
+      df = ctx.read_avro("path/to/data.avro")
+      
       # From Pandas DataFrame
       import pandas as pd
       pandas_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
@@ -75,6 +78,9 @@ DataFrames can be created in several ways:
       )
       df = ctx.from_arrow(batch)
 
+  For detailed information about reading from different data sources, see the :doc:`I/O Guide <../user-guide/io/index>`.
+  For custom data sources, see :ref:`io_custom_table_provider`.
+
 Common DataFrame Operations
 ---------------------------
 
@@ -370,5 +376,12 @@ Best Practices
 Additional Resources
 --------------------
 
-* `DataFusion User Guide <../user-guide/dataframe.html>`_ - Complete guide to using DataFrames
+* :doc:`../user-guide/dataframe` - Complete guide to using DataFrames
+* :doc:`../user-guide/io/index` - I/O Guide for reading data from various sources
+* :doc:`../user-guide/data-sources` - Comprehensive data sources guide
+* :ref:`io_csv` - CSV file reading
+* :ref:`io_parquet` - Parquet file reading
+* :ref:`io_json` - JSON file reading
+* :ref:`io_avro` - Avro file reading
+* :ref:`io_custom_table_provider` - Custom table providers
 * `API Reference <https://arrow.apache.org/datafusion-python/api/index.html>`_ - Full API reference