Commit c910244

init cloud fetch and a bunch of experiments
Signed-off-by: varun-edachali-dbx <varun.edachali@databricks.com>
1 parent 8da6f5a · commit c910244

15 files changed · +2191 -41 lines changed
Lines changed: 179 additions & 0 deletions
@@ -0,0 +1,179 @@
#!/usr/bin/env python3
"""
Test script specifically for testing SEA cloud fetch with multiple chunks.
This script focuses on verifying that the Python connector can properly handle
multiple chunks from the SEA API.
"""

import os
import sys
import logging
import time
import pyarrow
from databricks.sql.client import Connection

# Set up detailed logging
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

def test_sea_cloud_fetch_multiple_chunks():
    """
    Test the SEA cloud fetch implementation with multiple chunks.

    This function specifically tests:
    1. Very large dataset to force multiple chunks in the manifest
    2. Detailed logging of chunk information
    3. Progressive fetching to observe chunk transitions
    """
    # Use the values from export-tests-e2e.sh
    server_hostname = "adb-6436897454825492.12.azuredatabricks.net"
    http_path = "/sql/1.0/warehouses/2f03dd43e35e2aa0"
    access_token = os.environ.get("DATABRICKS_TOKEN")
    catalog = "peco"
    schema = "default"  # Using default schema which should be available

    if not access_token:
        logger.error("Missing required environment variable DATABRICKS_TOKEN.")
        sys.exit(1)

    try:
        # Create connection with SEA backend
        logger.info("Creating connection with SEA backend...")
        connection = Connection(
            server_hostname=server_hostname,
            http_path=http_path,
            access_token=access_token,
            catalog=catalog,
            schema=schema,
            use_sea=True,
            use_cloud_fetch=True,  # Enable cloud fetch to trigger EXTERNAL_LINKS + ARROW
            user_agent_entry="SEA-Test-Client",
        )

        logger.info(f"Successfully opened SEA session with ID: {connection.get_session_id_hex()}")

        # Create cursor with a moderate arraysize
        # We want chunks to be based on data size, not row count
        cursor = connection.cursor(arraysize=1000)

        # Execute a query that generates a VERY large dataset with large string values
        # This should force the server to split the result into multiple chunks
        query = """
        WITH large_dataset AS (
            SELECT
                id,
                id * 2 as double_id,
                id * 3 as triple_id,
                concat('value_', repeat(cast(id as string), 100)) as large_string_value,
                array_repeat(id, 50) as large_array_value,
                rand() as random_val,
                current_timestamp() as current_time
            FROM range(1, 100000) AS t(id)
        )
        SELECT * FROM large_dataset
        """
        logger.info(f"Executing query with large complex data: {query}")
        cursor.execute(query)

        # Access the underlying result set to log chunk information
        result_set = cursor.active_result_set

        # Directly examine the raw API response to see the full manifest
        if hasattr(result_set, 'results') and hasattr(result_set.results, '_sea_client') and hasattr(result_set.results, '_statement_id'):
            logger.info("Examining raw API response to see full manifest...")
            sea_client = result_set.results._sea_client
            statement_id = result_set.results._statement_id

            # Make a direct API call to get the statement status
            try:
                response = sea_client.http_client._make_request(
                    method="GET",
                    path=f"/api/2.0/sql/statements/{statement_id}",
                    data={}
                )

                # Log the manifest information
                if "manifest" in response:
                    manifest = response["manifest"]
                    logger.info(f"Manifest from API: {manifest}")

                    # Log chunk information
                    if "chunks" in manifest:
                        chunks = manifest["chunks"]
                        logger.info(f"Number of chunks in manifest: {len(chunks)}")
                        for i, chunk in enumerate(chunks):
                            logger.info(f"Chunk {i}: index={chunk.get('chunk_index')}, row_count={chunk.get('row_count')}, row_offset={chunk.get('row_offset')}, byte_count={chunk.get('byte_count')}")

                    # Log total_row_count and total_chunk_count
                    if "total_row_count" in manifest:
                        total_row_count = manifest["total_row_count"]
                        logger.info(f"Total row count: {total_row_count}")

                    if "total_chunk_count" in manifest:
                        total_chunk_count = manifest["total_chunk_count"]
                        logger.info(f"Total chunk count: {total_chunk_count}")

                # Log the external links information
                if "result" in response and "external_links" in response["result"]:
                    external_links = response["result"]["external_links"]
                    logger.info(f"Number of external links in response: {len(external_links)}")
                    for i, link in enumerate(external_links):
                        logger.info(f"Link {i}: chunk_index={link.get('chunk_index')}, row_count={link.get('row_count')}, next_chunk_index={link.get('next_chunk_index')}")
            except Exception as e:
                logger.warning(f"Error examining raw API response: {str(e)}")

        # Log initial chunk information
        if hasattr(result_set, 'results'):
            logger.info(f"Result set type: {type(result_set.results)}")

            if hasattr(result_set.results, '_total_chunk_count'):
                logger.info(f"Total chunks: {result_set.results._total_chunk_count}")

            if hasattr(result_set.results, 'initial_links'):
                logger.info(f"Initial links count: {len(result_set.results.initial_links)}")
                for i, link in enumerate(result_set.results.initial_links):
                    logger.info(f"Link {i}: chunk_index={link.chunk_index}, row_count={link.row_count}, next_chunk_index={link.next_chunk_index}")

            if hasattr(result_set.results, '_fetched_chunk_indices'):
                logger.info(f"Initial fetched chunk indices: {result_set.results._fetched_chunk_indices}")

        # Fetch data in progressively larger batches to force multiple chunk fetches
        # We'll fetch a large number of rows to ensure we need to fetch multiple chunks
        batch_sizes = [1000, 5000, 10000, 20000]
        total_rows_fetched = 0

        for i, batch_size in enumerate(batch_sizes):
            logger.info(f"Fetching batch {i+1} ({batch_size} rows)...")
            rows = cursor.fetchmany(batch_size)
            total_rows_fetched += len(rows)
            logger.info(f"Fetched {len(rows)} rows (total: {total_rows_fetched})")

            # Log chunk information after each fetch
            if hasattr(result_set, 'results') and hasattr(result_set.results, '_fetched_chunk_indices'):
                logger.info(f"Fetched chunk indices after batch {i+1}: {result_set.results._fetched_chunk_indices}")

                # If we have links, log information about them
                if hasattr(result_set.results, '_chunk_index_to_link'):
                    for chunk_index in sorted(result_set.results._fetched_chunk_indices):
                        if chunk_index in result_set.results._chunk_index_to_link:
                            link = result_set.results._chunk_index_to_link[chunk_index]
                            logger.info(f"  Chunk {chunk_index}: row_count={link.row_count}, next_chunk_index={link.next_chunk_index}")

        # Close cursor and connection
        cursor.close()
        connection.close()
        logger.info("Successfully closed SEA session")

    except Exception as e:
        logger.error(f"Error during SEA cloud fetch test: {str(e)}")
        import traceback
        logger.error(traceback.format_exc())
        sys.exit(1)

    logger.info("SEA cloud fetch test with multiple chunks completed successfully")

if __name__ == "__main__":
    test_sea_cloud_fetch_multiple_chunks()
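
For reference, the statement-response shape the script expects can be inferred from the fields it reads (manifest.chunks with chunk_index, row_count, row_offset and byte_count; manifest.total_row_count and total_chunk_count; result.external_links with chunk_index, row_count and next_chunk_index). The sketch below only illustrates that assumed shape with made-up numbers; it is not an authoritative SEA response schema.

# Hypothetical shape of the GET /api/2.0/sql/statements/{statement_id} response,
# inferred from the keys the test script accesses above. Field names, nesting and
# all values here are illustrative assumptions, not a documented schema.
example_response = {
    "manifest": {
        "total_row_count": 99999,
        "total_chunk_count": 3,
        "chunks": [
            {"chunk_index": 0, "row_count": 40000, "row_offset": 0, "byte_count": 52428800},
            {"chunk_index": 1, "row_count": 40000, "row_offset": 40000, "byte_count": 52428800},
            {"chunk_index": 2, "row_count": 19999, "row_offset": 80000, "byte_count": 26214400},
        ],
    },
    "result": {
        "external_links": [
            # Each entry is assumed to carry a presigned URL for one Arrow chunk.
            {"chunk_index": 0, "row_count": 40000, "next_chunk_index": 1,
             "external_link": "https://..."},
        ],
    },
}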
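
For comparison, fetching a large result with cloud fetch through the connector's public entry point might look like the minimal sketch below. It assumes the released databricks.sql.connect() API and its use_cloud_fetch flag (availability and default depend on the connector version); the use_sea=True switch exercised by the script above is specific to this branch and is not used here.

import os

from databricks import sql

# Minimal sketch: stream a large result in batches with cloud fetch enabled,
# letting the connector download Arrow chunks from presigned links as needed.
with sql.connect(
    server_hostname=os.environ["DATABRICKS_SERVER_HOSTNAME"],
    http_path=os.environ["DATABRICKS_HTTP_PATH"],
    access_token=os.environ["DATABRICKS_TOKEN"],
    use_cloud_fetch=True,  # assumed available in the installed connector version
) as connection:
    with connection.cursor() as cursor:
        cursor.execute("SELECT * FROM range(1, 100000)")
        while True:
            rows = cursor.fetchmany(10000)
            if not rows:
                break
            print(f"fetched {len(rows)} rows")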
