Skip to content

Commit fd2602c

Browse files
NiallEgan authored and susodapop committed
Change result fetching to use fetch.next
Elsewhere I found that calling fetchall() after a create temp view resulted in the following stack trace: ``` Traceback (most recent call last): File "<stdin>", line 1, in <module> File "/Users/niallegan/opt/miniconda3/lib/python3.7/site-packages/databricks/sql/client.py", line 318, in fetchall return self.active_result_set.fetchall() File "/Users/niallegan/opt/miniconda3/lib/python3.7/site-packages/databricks/sql/client.py", line 522, in fetchall return self._convert_arrow_table(self.fetchall_arrow()) File "/Users/niallegan/opt/miniconda3/lib/python3.7/site-packages/databricks/sql/client.py", line 496, in fetchall_arrow results = self.results.remaining_rows() File "/Users/niallegan/opt/miniconda3/lib/python3.7/site-packages/databricks/sql/utils.py", line 27, in remaining_rows slice = self.arrow_table.slice(self.cur_row_index, self.n_valid_rows - self.cur_row_index) File "pyarrow/table.pxi", line 1125, in pyarrow.lib.Table.slice IndexError: Offset must be non-negative ``` The problem was that the startRowOffset returned by the server was after what we requested. However, it turns out that on the client side we should not be setting `startRowOffset` at all, since Thrift uses FETCH_NEXT by default. This PR makes `FETCH_NEXT` explicit and stops tracking the row offset. I also took this opportunity to improve unit test coverage for dealing with the arrow batches and row counts. * Manually running SELECT * from temp view test against prod * New smoke test for SELECT * from temp view * Increase unit test coverage - Did you add usage logs or metrics? Please mention them here. - Create dashboards or monitoring notebooks? Please link them here. - See http://go/obs/user for docs on our observability tools.
1 parent ecd02d8 commit fd2602c

File tree

7 files changed

+91
-17
lines changed

7 files changed

+91
-17
lines changed

cmdexec/clients/python/src/databricks/sql/client.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -423,11 +423,11 @@ def __init__(self,
423423
self.has_been_closed_server_side = execute_response.has_been_closed_server_side
424424
self.has_more_rows = execute_response.has_more_rows
425425
self.buffer_size_bytes = result_buffer_size_bytes
426-
self._row_index = 0
427426
self.arraysize = arraysize
428427
self.thrift_backend = thrift_backend
429428
self.description = execute_response.description
430429
self._arrow_schema = execute_response.arrow_schema
430+
self._next_row_index = 0
431431

432432
if execute_response.arrow_queue:
433433
# In this case the server has taken the fast path and returned an initial batch of
@@ -447,8 +447,12 @@ def __iter__(self):
447447

448448
def _fill_results_buffer(self):
449449
results, has_more_rows = self.thrift_backend.fetch_results(
450-
self.command_id, self.arraysize, self.buffer_size_bytes, self._row_index,
451-
self._arrow_schema, self.description)
450+
op_handle=self.command_id,
451+
max_rows=self.arraysize,
452+
max_bytes=self.buffer_size_bytes,
453+
expected_row_start_offset=self._next_row_index,
454+
arrow_schema=self._arrow_schema,
455+
description=self.description)
452456
self.results = results
453457
self.has_more_rows = has_more_rows
454458

@@ -468,27 +472,27 @@ def fetchmany_arrow(self, n_rows: int) -> pyarrow.Table:
468472
raise ValueError("n_rows argument for fetchmany is %s but must be >= 0", n_rows)
469473
results = self.results.next_n_rows(n_rows)
470474
n_remaining_rows = n_rows - results.num_rows
471-
self._row_index += results.num_rows
475+
self._next_row_index += results.num_rows
472476

473477
while n_remaining_rows > 0 and not self.has_been_closed_server_side and self.has_more_rows:
474478
self._fill_results_buffer()
475479
partial_results = self.results.next_n_rows(n_remaining_rows)
476480
results = pyarrow.concat_tables([results, partial_results])
477481
n_remaining_rows -= partial_results.num_rows
478-
self._row_index += partial_results.num_rows
482+
self._next_row_index += partial_results.num_rows
479483

480484
return results
481485

482486
def fetchall_arrow(self) -> pyarrow.Table:
483487
"""Fetch all (remaining) rows of a query result, returning them as a PyArrow table."""
484488
results = self.results.remaining_rows()
485-
self._row_index += results.num_rows
489+
self._next_row_index += results.num_rows
486490

487491
while not self.has_been_closed_server_side and self.has_more_rows:
488492
self._fill_results_buffer()
489493
partial_results = self.results.remaining_rows()
490494
results = pyarrow.concat_tables([results, partial_results])
491-
self._row_index += partial_results.num_rows
495+
self._next_row_index += partial_results.num_rows
492496

493497
return results
494498

cmdexec/clients/python/src/databricks/sql/thrift_backend.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -639,7 +639,8 @@ def _handle_execute_response(self, resp, cursor):
639639

640640
return self._results_message_to_execute_response(resp, final_operation_state)
641641

642-
def fetch_results(self, op_handle, max_rows, max_bytes, row_offset, arrow_schema, description):
642+
def fetch_results(self, op_handle, max_rows, max_bytes, expected_row_start_offset, arrow_schema,
643+
description):
643644
assert (op_handle is not None)
644645

645646
req = ttypes.TFetchResultsReq(
@@ -651,12 +652,14 @@ def fetch_results(self, op_handle, max_rows, max_bytes, row_offset, arrow_schema
651652
),
652653
maxRows=max_rows,
653654
maxBytes=max_bytes,
654-
startRowOffset=row_offset,
655-
)
655+
orientation=ttypes.TFetchOrientation.FETCH_NEXT)
656656

657657
resp = self.make_request(self._client.FetchResults, req)
658+
if resp.results.startRowOffset > expected_row_start_offset:
659+
logger.warning("Expected results to start from {} but they instead start at {}".format(
660+
expected_row_start_offset, resp.results.startRowOffset))
658661
arrow_results, n_rows = self._create_arrow_table(resp.results, arrow_schema, description)
659-
arrow_queue = ArrowQueue(arrow_results, n_rows, row_offset - resp.results.startRowOffset)
662+
arrow_queue = ArrowQueue(arrow_results, n_rows)
660663

661664
return arrow_queue, resp.hasMoreRows
662665

cmdexec/clients/python/src/databricks/sql/utils.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66

77
class ArrowQueue:
8-
def __init__(self, arrow_table: pyarrow.Table, n_valid_rows: int, start_row_index: int):
8+
def __init__(self, arrow_table: pyarrow.Table, n_valid_rows: int, start_row_index: int = 0):
99
"""
1010
A queue-like wrapper over an Arrow table
1111
@@ -20,6 +20,8 @@ def __init__(self, arrow_table: pyarrow.Table, n_valid_rows: int, start_row_inde
2020
def next_n_rows(self, num_rows: int) -> pyarrow.Table:
2121
"""Get upto the next n rows of the Arrow dataframe"""
2222
length = min(num_rows, self.n_valid_rows - self.cur_row_index)
23+
# Note that the table.slice API is not the same as Python's slice
24+
# The second argument should be length, not end index
2325
slice = self.arrow_table.slice(self.cur_row_index, length)
2426
self.cur_row_index += slice.num_rows
2527
return slice
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
import unittest
2+
3+
import pyarrow as pa
4+
5+
from databricks.sql.utils import ArrowQueue
6+
7+
8+
class ArrowQueueSuite(unittest.TestCase):
9+
@staticmethod
10+
def make_arrow_table(batch):
11+
n_cols = len(batch[0]) if batch else 0
12+
schema = pa.schema({"col%s" % i: pa.uint32() for i in range(n_cols)})
13+
cols = [[batch[row][col] for row in range(len(batch))] for col in range(n_cols)]
14+
return pa.Table.from_pydict(dict(zip(schema.names, cols)), schema=schema)
15+
16+
def test_fetchmany_respects_n_rows(self):
17+
arrow_table = self.make_arrow_table([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]])
18+
aq = ArrowQueue(arrow_table, 3)
19+
self.assertEqual(aq.next_n_rows(2), self.make_arrow_table([[0, 1, 2], [3, 4, 5]]))
20+
self.assertEqual(aq.next_n_rows(2), self.make_arrow_table([[6, 7, 8]]))
21+
22+
def test_fetch_remaining_rows_respects_n_rows(self):
23+
arrow_table = self.make_arrow_table([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]])
24+
aq = ArrowQueue(arrow_table, 3)
25+
self.assertEqual(aq.next_n_rows(1), self.make_arrow_table([[0, 1, 2]]))
26+
self.assertEqual(aq.remaining_rows(), self.make_arrow_table([[3, 4, 5], [6, 7, 8]]))

cmdexec/clients/python/tests/test_fetches.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ def make_arrow_table(batch):
2424
@staticmethod
2525
def make_arrow_queue(batch):
2626
_, table = FetchTests.make_arrow_table(batch)
27-
queue = ArrowQueue(table, len(batch), 0)
27+
queue = ArrowQueue(table, len(batch))
2828
return queue
2929

3030
@staticmethod
@@ -51,7 +51,8 @@ def make_dummy_result_set_from_initial_results(initial_results):
5151
def make_dummy_result_set_from_batch_list(batch_list):
5252
batch_index = 0
5353

54-
def fetch_results(op_handle, max_rows, max_bytes, row_offset, arrow_schema, description):
54+
def fetch_results(op_handle, max_rows, max_bytes, expected_row_start_offset, arrow_schema,
55+
description):
5556
nonlocal batch_index
5657
results = FetchTests.make_arrow_queue(batch_list[batch_index])
5758
batch_index += 1

cmdexec/clients/python/tests/test_thrift_backend.py

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -498,11 +498,48 @@ def test_handle_execute_response_reads_has_more_rows_in_result_response(
498498
thrift_backend = self._make_fake_thrift_backend()
499499

500500
thrift_backend._handle_execute_response(execute_resp, Mock())
501-
_, has_more_rows_resp = thrift_backend.fetch_results(Mock(), 1, 1, 0, Mock(),
502-
Mock())
501+
_, has_more_rows_resp = thrift_backend.fetch_results(
502+
op_handle=Mock(),
503+
max_rows=1,
504+
max_bytes=1,
505+
expected_row_start_offset=0,
506+
arrow_schema=Mock(),
507+
description=Mock())
503508

504509
self.assertEqual(has_more_rows, has_more_rows_resp)
505510

511+
@patch("databricks.sql.thrift_backend.TCLIService.Client")
512+
def test_arrow_batches_row_count_are_respected(self, tcli_service_class):
513+
# make some semi-real arrow batches and check the number of rows is correct in the queue
514+
tcli_service_instance = tcli_service_class.return_value
515+
t_fetch_results_resp = ttypes.TFetchResultsResp(
516+
status=self.okay_status,
517+
hasMoreRows=False,
518+
results=ttypes.TRowSet(
519+
startRowOffset=0,
520+
rows=[],
521+
arrowBatches=[
522+
ttypes.TSparkArrowBatch(batch=bytearray(), rowCount=15) for _ in range(10)
523+
]))
524+
tcli_service_instance.FetchResults.return_value = t_fetch_results_resp
525+
schema = pyarrow.schema([
526+
pyarrow.field("column1", pyarrow.int32()),
527+
pyarrow.field("column2", pyarrow.string()),
528+
pyarrow.field("column3", pyarrow.float64()),
529+
pyarrow.field("column3", pyarrow.binary())
530+
])
531+
532+
thrift_backend = ThriftBackend("foobar", 443, "path", [])
533+
arrow_queue, has_more_results = thrift_backend.fetch_results(
534+
op_handle=Mock(),
535+
max_rows=1,
536+
max_bytes=1,
537+
expected_row_start_offset=0,
538+
arrow_schema=schema,
539+
description=MagicMock())
540+
541+
self.assertEqual(arrow_queue.n_valid_rows, 15 * 10)
542+
506543
@patch("databricks.sql.thrift_backend.TCLIService.Client")
507544
def test_execute_statement_calls_client_and_handle_execute_response(self, tcli_service_class):
508545
tcli_service_instance = tcli_service_class.return_value

cmdexec/clients/python/tests/tests.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
from test_fetches import FetchTests
1414
from test_thrift_backend import ThriftBackendTestSuite
15+
from test_arrow_queue import ArrowQueueSuite
1516

1617

1718
class ClientTestSuite(unittest.TestCase):
@@ -342,7 +343,7 @@ def test_version_is_canonical(self):
342343
if __name__ == '__main__':
343344
suite = unittest.TestLoader().loadTestsFromModule(sys.modules[__name__])
344345
loader = unittest.TestLoader()
345-
test_classes = [ClientTestSuite, FetchTests, ThriftBackendTestSuite]
346+
test_classes = [ClientTestSuite, FetchTests, ThriftBackendTestSuite, ArrowQueueSuite]
346347
suites_list = []
347348
for test_class in test_classes:
348349
suite = loader.loadTestsFromTestCase(test_class)

0 commit comments

Comments (0)