
Commit 06551a0

more comments
1 parent 63661f2 commit 06551a0

File tree: 5 files changed, +62 -21 lines


src/databricks/sql/client.py

Lines changed: 37 additions & 3 deletions
@@ -1402,17 +1402,42 @@ def _convert_columnar_table(self, table):
             result.append(ResultRow(*curr_row))
 
         return result
+
+    def print_mem(self):
+        import os
+        import psutil
+
+        process = psutil.Process(os.getpid())
+        mem_info = process.memory_info()
+        total_mem_mb = mem_info.rss / 1024 / 1024
+        cpu_percent = process.cpu_percent(interval=0.1)
+        print(f"Total memory usage: {total_mem_mb:.2f} MB")
+        print(f"CPU percent: {cpu_percent:.2f}%")
+        # total_size_bytes = table.get_total_buffer_size()
+        # total_size_mb = total_size_bytes / (1024 * 1024)
+
+        # print(f"Total PyArrow table size: {total_size_bytes} bytes ({total_size_mb:.2f} MB)")
 
     def _convert_arrow_table(self, table: "pyarrow.Table"):
+        import sys
+        from pympler import asizeof
+
+        self.print_mem()
+        print(f"Memory size table: {table.nbytes / (1024 ** 2):.2f} MB")
+        # Convert to MB for easier reading
         column_names = [c[0] for c in self.description]
         ResultRow = Row(*column_names)
 
         if self.connection.disable_pandas is True:
             start_time = time.time()
             columns_as_lists = [col.to_pylist() for col in table.itercolumns()]
+            self.print_mem()
+            print(f"Memory size columns_as_lists: {sum(sys.getsizeof(col) for col in columns_as_lists) / (1024 ** 2):.2f} MB")
             res = [ResultRow(*row) for row in zip(*columns_as_lists)]
+            self.print_mem()
             end_time = time.time()
             print(f"Time taken to convert arrow table to list: {end_time - start_time} seconds")
+            print(f"Memory size res: {sum(sys.getsizeof(row) for row in res) / (1024 ** 2):.2f} MB")
             return res
 
         start_time = time.time()
@@ -1436,14 +1461,23 @@ def _convert_arrow_table(self, table: "pyarrow.Table"):
 
         # Need to rename columns, as the to_pandas function cannot handle duplicate column names
         table_renamed = table.rename_columns([str(c) for c in range(table.num_columns)])
+        print(f"Memory size table_renamed: {table_renamed.nbytes / (1024 ** 2):.2f} MB")
         df = table_renamed.to_pandas(
             types_mapper=dtype_mapping.get,
             date_as_object=True,
             timestamp_as_object=True,
+            self_destruct=True,
         )
+        print(f"Memory size df: {df.memory_usage(deep=True).sum() / (1024 ** 2):.2f} MB")
+        self.print_mem()
+        # del table_renamed
 
         res = df.to_numpy(na_value=None, dtype="object")
+        print(f"Memory size res: {res.nbytes / (1024 ** 2):.2f} MB")
+        self.print_mem()
+        # del df
         tmp_res = [ResultRow(*v) for v in res]
+        self.print_mem()
         end_time = time.time()
         print(f"Time taken to convert arrow table to list: {end_time - start_time} seconds")
         return tmp_res
@@ -1471,7 +1505,7 @@ def fetchmany_arrow(self, size: int) -> "pyarrow.Table":
             and not self.has_been_closed_server_side
             and self.has_more_rows
         ):
-            print(f"TOTAL DATA ROWS {TOTAL_SIZE}")
+            # print(f"TOTAL DATA ROWS {TOTAL_SIZE}")
             self._fill_results_buffer()
             partial_results = self.results.next_n_rows(n_remaining_rows)
             results.append(partial_results)
@@ -1515,11 +1549,11 @@ def fetchall_arrow(self) -> "pyarrow.Table":
         self._next_row_index += results.num_rows
 
         # partial_result_chunks = [results]
-        print("Server side has more rows", self.has_more_rows)
+        # print("Server side has more rows", self.has_more_rows)
         TOTAL_SIZE = results.num_rows
 
         while not self.has_been_closed_server_side and self.has_more_rows:
-            print(f"TOTAL DATA ROWS {TOTAL_SIZE}")
+            # print(f"TOTAL DATA ROWS {TOTAL_SIZE}")
             self._fill_results_buffer()
             partial_results = self.results.remaining_rows()
             results.append(partial_results)
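
Aside: a minimal, self-contained sketch of the psutil probe this commit threads through _convert_arrow_table, shown so the RSS readings in the diff are reproducible. It assumes only that psutil and pyarrow are installed; the label argument and the toy table are illustrative, not connector API.

import os

import psutil
import pyarrow as pa

def print_mem(label: str) -> None:
    # Report the current process's resident set size (RSS) in MB.
    rss_mb = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024
    print(f"[{label}] RSS: {rss_mb:.2f} MB")

table = pa.table({"x": list(range(1_000_000))})
print_mem("before to_pandas")
# self_destruct=True lets pyarrow release each Arrow column as soon as it has
# been converted, lowering peak memory; the source table must not be used
# after this call.
df = table.to_pandas(self_destruct=True)
print_mem("after to_pandas")

Without self_destruct=True, the Arrow buffers and the pandas copy are both resident at the conversion peak, which is exactly what the print_mem calls around to_pandas in the diff are measuring.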

src/databricks/sql/cloudfetch/download_manager.py

Lines changed: 2 additions & 2 deletions
@@ -84,8 +84,8 @@ def _schedule_downloads(self):
         """
         While download queue has a capacity, peek pending links and submit them to thread pool.
         """
-        print("Schedule_downloads")
-        logger.debug("ResultFileDownloadManager: schedule downloads")
+        # print("Schedule_downloads")
+        # logger.debug("ResultFileDownloadManager: schedule downloads")
         while (len(self._download_tasks) < self._max_download_threads) and (
             len(self._pending_links) > 0
         ):

src/databricks/sql/cloudfetch/downloader.py

Lines changed: 7 additions & 7 deletions
@@ -99,9 +99,9 @@ def run(self) -> DownloadedFile:
             verify=self._ssl_options.tls_verify,
             headers=self.link.httpHeaders
         ) as response:
-            print_text = [
+            # print_text = [
 
-            ]
+            # ]
 
             response.raise_for_status()
 
@@ -127,12 +127,12 @@ def run(self) -> DownloadedFile:
                 )
             )
 
-            print_text.append(
-                f"Downloaded file startRowOffset - {self.link.startRowOffset} - rowCount - {self.link.rowCount}"
-            )
+            # print_text.append(
+            #     f"Downloaded file startRowOffset - {self.link.startRowOffset} - rowCount - {self.link.rowCount}"
+            # )
 
-            for text in print_text:
-                print(text)
+            # for text in print_text:
+            #     print(text)
 
             return DownloadedFile(
                 decompressed_data,

src/databricks/sql/common/http.py

Lines changed: 1 addition & 1 deletion
@@ -73,7 +73,7 @@ def execute(
             start_time = time.time()
             response = self.session.request(method.value, url, **kwargs)
             end_time = time.time()
-            print(f"Downloaded file in {end_time - start_time} seconds")
+            # print(f"Downloaded file in {end_time - start_time} seconds")
             yield response
         except Exception as e:
             logger.error("Error executing HTTP request in DatabricksHttpClient: %s", e)

src/databricks/sql/utils.py

Lines changed: 15 additions & 8 deletions
@@ -219,7 +219,12 @@ def batch_generator():
             yield self.convert_decimals_in_record_batch(batch)
 
         return pyarrow.Table.from_batches(batch_generator())
-
+
+    def remove_extraneous_rows(self):
+        # Trim rows beyond the expected self.num_rows from the tail of the buffered batches.
+        excess = sum(batch.num_rows for batch in self.record_batches) - self.num_rows
+        while excess > 0 and self.record_batches:
+            last = self.record_batches.pop()
+            if last.num_rows > excess:
+                self.record_batches.append(last.slice(0, last.num_rows - excess))
+            excess -= last.num_rows
 
 class ColumnQueue(ResultSetQueue):
     def __init__(self, column_table: ColumnTable):
@@ -319,8 +324,8 @@ def __init__(
                     result_link.startRowOffset, result_link.rowCount
                 )
             )
-        print("Initial Setup Cloudfetch Queue")
-        print(f"No of result links - {len(result_links)}")
+        # print("Initial Setup Cloudfetch Queue")
+        # print(f"No of result links - {len(result_links)}")
         self.download_manager = ResultFileDownloadManager(
             links=result_links or [],
             max_download_threads=self.max_download_threads,
@@ -383,8 +388,8 @@ def remaining_rows(self):
         # results = self.table.slice(0, 0)
         # result = self._create_empty_table()
 
-        print("remaining_rows call")
-        print(f"self.table.num_rows - {self.table.num_rows}")
+        # print("remaining_rows call")
+        # print(f"self.table.num_rows - {self.table.num_rows}")
         while self.table:
             # table_slice = self.table.slice(
             #     self.table_row_index, self.table.num_rows - self.table_row_index
@@ -393,7 +398,7 @@ def remaining_rows(self):
             # self.table_row_index += table_slice.num_rows
             self.table = self._create_next_table()
             # self.table_row_index = 0
-        print(f"result.num_rows - {result.num_rows}")
+        # print(f"result.num_rows - {result.num_rows}")
         return result
 
     def _create_next_table(self) -> ArrowStreamTable:
@@ -419,6 +424,8 @@ def _create_next_table(self) -> ArrowStreamTable:
             list(pyarrow.ipc.open_stream(downloaded_file.file_bytes)),
             downloaded_file.row_count,
             self.description)
+
+        arrow_stream_table.remove_extraneous_rows()
         # arrow_table = create_arrow_table_from_arrow_file(
         #     downloaded_file.file_bytes, self.description
         # )
@@ -439,8 +446,8 @@ def _create_next_table(self) -> ArrowStreamTable:
             )
         )
 
-        print("_create_next_table")
-        print(f"arrow_stream_table.num_rows - {arrow_stream_table.num_rows}")
+        # print("_create_next_table")
+        # print(f"arrow_stream_table.num_rows - {arrow_stream_table.num_rows}")
         return arrow_stream_table
 
     def _create_empty_table(self) -> ArrowStreamTable:
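
For reference, a self-contained sketch of the trimming technique remove_extraneous_rows applies above; trim_batches is a hypothetical free function, not part of the connector.

import pyarrow as pa

def trim_batches(batches, num_rows):
    # Drop or slice trailing record batches until exactly num_rows rows remain.
    excess = sum(b.num_rows for b in batches) - num_rows
    while excess > 0 and batches:
        last = batches.pop()
        if last.num_rows > excess:
            # Keep only the leading rows of the batch that crosses the limit.
            batches.append(last.slice(0, last.num_rows - excess))
        excess -= last.num_rows
    return batches

batch = pa.RecordBatch.from_pydict({"x": [1, 2, 3, 4]})
trimmed = trim_batches([batch, batch], 5)  # 8 rows trimmed down to 5
assert sum(b.num_rows for b in trimmed) == 5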
