Client-side Decimal deserialisation

NiallEgan · susodapop · commit 76695ea4dc1a · 2022-06-02T11:14:16.000-05:00
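This PR adds client-side deserialisation of Decimals: the Thrift backend now converts decimal columns in each fetched Arrow table to `decimal128(precision, scale)`, using the precision and scale carried in the result-set description, instead of converting values one by one in `ResultSet`.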

- New unit tests
- New integration tests
diff --git a/cmdexec/clients/python/setup.py b/cmdexec/clients/python/setup.py
@@ -5,6 +5,6 @@
     version="0.0.0",
     package_dir={"": "src"},
     packages=setuptools.find_packages(where="src"),
-    install_requires=["pyarrow", 'thrift>=0.10.0'],
+    install_requires=["pyarrow", 'thrift>=0.10.0', "pandas"],
     author="Databricks",
 )
diff --git a/cmdexec/clients/python/src/databricks/sql/client.py b/cmdexec/clients/python/src/databricks/sql/client.py
@@ -17,10 +17,6 @@
 DEFAULT_RESULT_BUFFER_SIZE_BYTES = 10485760
 DEFAULT_ARRAY_SIZE = 100000
 
-TYPES_CONVERTER = {
-    "decimal": Decimal,
-}
-
 
 class Connection:
     def __init__(self,
@@ -440,7 +436,6 @@ def __init__(self,
         self.has_more_rows = execute_response.has_more_rows
         self.buffer_size_bytes = result_buffer_size_bytes
         self._row_index = 0
-        self.description = None
         self.arraysize = arraysize
         self.thrift_backend = thrift_backend
         self.description = execute_response.description
@@ -465,24 +460,14 @@ def __iter__(self):
     def _fill_results_buffer(self):
         results, has_more_rows = self.thrift_backend.fetch_results(
             self.command_id, self.arraysize, self.buffer_size_bytes, self._row_index,
-            self._arrow_schema)
+            self._arrow_schema, self.description)
         self.results = results
         self.has_more_rows = has_more_rows
 
-    @staticmethod
-    def parse_type(type_, value):
-        converter = TYPES_CONVERTER.get(type_)
-        if converter:
-            return value if value is None else converter(value)
-        else:
-            return value
-
     def _convert_arrow_table(self, table):
         n_rows, _ = table.shape
-        list_repr = [[
-            self.parse_type(self.description[col_index][1], col[row_index].as_py())
-            for col_index, col in enumerate(table.itercolumns())
-        ] for row_index in range(n_rows)]
+        list_repr = [[col[row_index].as_py() for col in table.itercolumns()]
+                     for row_index in range(n_rows)]
         return list_repr
 
     def fetchmany_arrow(self, n_rows: int) -> pyarrow.Table:
diff --git a/cmdexec/clients/python/src/databricks/sql/thrift_backend.py b/cmdexec/clients/python/src/databricks/sql/thrift_backend.py
@@ -1,3 +1,4 @@
+from decimal import Decimal
 import logging
 import time
 import threading
@@ -190,13 +191,32 @@ def _poll_for_status(self, op_handle):
         )
         return self.make_request(self._client.GetOperationStatus, req)
 
-    def _create_arrow_table(self, t_row_set, schema):
+    def _create_arrow_table(self, t_row_set, arrow_schema, description):
         if t_row_set.columns is not None:
-            return ThriftBackend._convert_column_based_set_to_arrow_table(t_row_set.columns, schema)
-        if t_row_set.arrowBatches is not None:
-            return ThriftBackend._convert_arrow_based_set_to_arrow_table(
-                t_row_set.arrowBatches, schema)
-        raise OperationalError("Unsupported TRowSet instance {}".format(t_row_set))
+            arrow_table, num_rows = ThriftBackend._convert_column_based_set_to_arrow_table(
+                t_row_set.columns, arrow_schema)
+        elif t_row_set.arrowBatches is not None:
+            arrow_table, num_rows = ThriftBackend._convert_arrow_based_set_to_arrow_table(
+                t_row_set.arrowBatches, arrow_schema)
+        else:
+            raise OperationalError("Unsupported TRowSet instance {}".format(t_row_set))
+        return self._convert_decimals_in_arrow_table(arrow_table, description), num_rows
+
+    @staticmethod
+    def _convert_decimals_in_arrow_table(table, description):
+        for (i, col) in enumerate(table.itercolumns()):
+            if description[i][1] == 'decimal':
+                decimal_col = col.to_pandas().apply(lambda v: v if v is None else Decimal(v))
+                precision, scale = description[i][4], description[i][5]
+                assert scale is not None
+                assert precision is not None
+                # Spark limits decimals to a maximum precision of 38 (and scale <= precision),
+                # so a 128-bit Arrow decimal is guaranteed to be big enough
+                dtype = pyarrow.decimal128(precision, scale)
+                col_data = pyarrow.array(decimal_col, type=dtype)
+                field = table.field(i).with_type(dtype)
+                table = table.set_column(i, field, col_data)
+        return table
 
     @staticmethod
     def _convert_arrow_based_set_to_arrow_table(arrow_batches, schema):
@@ -304,17 +324,32 @@ def convert_col(t_column_desc):
         return pyarrow.schema([convert_col(col) for col in t_table_schema.columns])
 
     @staticmethod
-    def _hive_schema_to_description(t_table_schema):
-        def clean_type(typeEntry):
-            if typeEntry.primitiveEntry:
-                name = ttypes.TTypeId._VALUES_TO_NAMES[typeEntry.primitiveEntry.type]
-                # Drop _TYPE suffix
-                return (name[:-5] if name.endswith("_TYPE") else name).lower()
+    def _col_to_description(col):
+        type_entry = col.typeDesc.types[0]
+
+        if type_entry.primitiveEntry:
+            name = ttypes.TTypeId._VALUES_TO_NAMES[type_entry.primitiveEntry.type]
+            # Drop _TYPE suffix
+            cleaned_type = (name[:-5] if name.endswith("_TYPE") else name).lower()
+        else:
+            raise OperationalError("Thrift protocol error: t_type_entry not a primitiveEntry")
+
+        if type_entry.primitiveEntry.type == ttypes.TTypeId.DECIMAL_TYPE:
+            qualifiers = type_entry.primitiveEntry.typeQualifiers.qualifiers
+            if qualifiers and "precision" in qualifiers and "scale" in qualifiers:
+                precision, scale = qualifiers["precision"].i32Value, qualifiers["scale"].i32Value
             else:
-                raise OperationalError("Thrift protocol error: t_type_entry not a primitiveEntry")
+                raise OperationalError(
+                    "Decimal type did not provide typeQualifier precision, scale in "
+                    "primitiveEntry {}".format(type_entry.primitiveEntry))
+        else:
+            precision, scale = None, None
 
-        return [(col.columnName, clean_type(col.typeDesc.types[0]), None, None, None, None, None)
-                for col in t_table_schema.columns]
+        return col.columnName, cleaned_type, None, None, precision, scale, None
+
+    @staticmethod
+    def _hive_schema_to_description(t_table_schema):
+        return [ThriftBackend._col_to_description(col) for col in t_table_schema.columns]
 
     def _results_message_to_execute_response(self, resp, operation_state):
         if resp.directResults and resp.directResults.resultSetMetadata:
@@ -341,7 +376,7 @@ def _results_message_to_execute_response(self, resp, operation_state):
             assert (direct_results.resultSet.results.startRowOffset == 0)
             assert (direct_results.resultSetMetadata)
             arrow_results, n_rows = self._create_arrow_table(direct_results.resultSet.results,
-                                                             arrow_schema)
+                                                             arrow_schema, description)
             arrow_queue_opt = ArrowQueue(arrow_results, n_rows, 0)
         else:
             arrow_queue_opt = None
@@ -477,7 +512,7 @@ def _handle_execute_response(self, resp, cursor):
 
         return self._results_message_to_execute_response(resp, final_operation_state)
 
-    def fetch_results(self, op_handle, max_rows, max_bytes, row_offset, arrow_schema):
+    def fetch_results(self, op_handle, max_rows, max_bytes, row_offset, arrow_schema, description):
         assert (op_handle is not None)
 
         req = ttypes.TFetchResultsReq(
@@ -493,7 +528,7 @@ def fetch_results(self, op_handle, max_rows, max_bytes, row_offset, arrow_schema
         )
 
         resp = self.make_request(self._client.FetchResults, req)
-        arrow_results, n_rows = self._create_arrow_table(resp.results, arrow_schema)
+        arrow_results, n_rows = self._create_arrow_table(resp.results, arrow_schema, description)
         arrow_queue = ArrowQueue(arrow_results, n_rows, row_offset - resp.results.startRowOffset)
 
         return arrow_queue, resp.hasMoreRows
diff --git a/cmdexec/clients/python/test-container-with-reqs.dockerfile b/cmdexec/clients/python/test-container-with-reqs.dockerfile
@@ -4,6 +4,7 @@ RUN pip install grpcio==1.41.0 \
                 pyarrow==5.0.0 \
                 protobuf==3.18.1 \
                 cryptography==35.0.0 \
-                thrift==0.13.0
+                thrift==0.13.0 \
+                pandas==1.3.4
 
 ENTRYPOINT ["./docker-entrypoint.sh"]
diff --git a/cmdexec/clients/python/tests/test_fetches.py b/cmdexec/clients/python/tests/test_fetches.py
@@ -51,7 +51,7 @@ def make_dummy_result_set_from_initial_results(initial_results):
     def make_dummy_result_set_from_batch_list(batch_list):
         batch_index = 0
 
-        def fetch_results(op_handle, max_rows, max_bytes, row_offset, arrow_schema):
+        def fetch_results(op_handle, max_rows, max_bytes, row_offset, arrow_schema, description):
             nonlocal batch_index
             results = FetchTests.make_arrow_queue(batch_list[batch_index])
             batch_index += 1
diff --git a/cmdexec/clients/python/tests/test_thrift_backend.py b/cmdexec/clients/python/tests/test_thrift_backend.py
@@ -1,15 +1,17 @@
+from collections import OrderedDict
+from decimal import Decimal
+import itertools
 import unittest
 from unittest.mock import patch, MagicMock, Mock
-import itertools
+
 import pyarrow
 
 from databricks.sql.thrift_api.TCLIService import ttypes
-
 from databricks.sql import *
 from databricks.sql.thrift_backend import ThriftBackend
 
 
-class TestThriftBackend(unittest.TestCase):
+class ThriftBackendTestSuite(unittest.TestCase):
     okay_status = ttypes.TStatus(statusCode=ttypes.TStatusCode.SUCCESS_STATUS)
 
     bad_status = ttypes.TStatus(
@@ -55,7 +57,7 @@ def _make_fake_thrift_backend(self):
         thrift_backend._hive_schema_to_arrow_schema = Mock()
         thrift_backend._hive_schema_to_description = Mock()
         thrift_backend._create_arrow_table = MagicMock()
-        thrift_backend._create_arrow_table.return_value = (Mock(), Mock())
+        thrift_backend._create_arrow_table.return_value = (MagicMock(), Mock())
         return thrift_backend
 
     def test_hive_schema_to_arrow_schema_preserves_column_names(self):
@@ -173,6 +175,28 @@ def test_hive_schema_to_description_preserves_column_names_and_types(self):
             ("", "struct", None, None, None, None, None),
         ])
 
+    def test_hive_schema_to_description_preserves_scale_and_precision(self):
+        columns = [
+            ttypes.TColumnDesc(
+                columnName="column 1",
+                typeDesc=ttypes.TTypeDesc(types=[
+                    ttypes.TTypeEntry(
+                        ttypes.TPrimitiveTypeEntry(
+                            type=ttypes.TTypeId.DECIMAL_TYPE,
+                            typeQualifiers=ttypes.TTypeQualifiers(
+                                qualifiers={
+                                    "precision": ttypes.TTypeQualifierValue(i32Value=10),
+                                    "scale": ttypes.TTypeQualifierValue(i32Value=100),
+                                })))
+                ])),
+        ]
+        t_table_schema = ttypes.TTableSchema(columns)
+
+        description = ThriftBackend._hive_schema_to_description(t_table_schema)
+        self.assertEqual(description, [
+            ("column 1", "decimal", None, None, 10, 100, None),
+        ])
+
     def test_make_request_checks_status_code(self):
         error_codes = [ttypes.TStatusCode.ERROR_STATUS, ttypes.TStatusCode.INVALID_HANDLE_STATUS]
         thrift_backend = ThriftBackend("foo", 123, "bar", [])
@@ -390,7 +414,7 @@ def test_handle_execute_response_reads_has_more_rows_in_result_response(
                                                           self.execute_response_types):
             with self.subTest(has_more_rows=has_more_rows, resp_type=resp_type):
                 tcli_service_instance = tcli_service_class.return_value
-                results_mock = Mock()
+                results_mock = MagicMock()
                 results_mock.startRowOffset = 0
 
                 execute_resp = resp_type(
@@ -415,7 +439,8 @@ def test_handle_execute_response_reads_has_more_rows_in_result_response(
                 thrift_backend = self._make_fake_thrift_backend()
 
                 thrift_backend._handle_execute_response(execute_resp, Mock())
-                _, has_more_rows_resp = thrift_backend.fetch_results(Mock(), 1, 1, 0, Mock())
+                _, has_more_rows_resp = thrift_backend.fetch_results(Mock(), 1, 1, 0, Mock(),
+                                                                     Mock())
 
                 self.assertEqual(has_more_rows, has_more_rows_resp)
 
@@ -603,25 +628,27 @@ def test_create_arrow_table_raises_error_for_unsupported_type(self):
         t_row_set = ttypes.TRowSet()
         thrift_backend = ThriftBackend("foobar", 443, "path", [])
         with self.assertRaises(OperationalError):
-            thrift_backend._create_arrow_table(t_row_set, None)
+            thrift_backend._create_arrow_table(t_row_set, None, Mock())
 
     @patch.object(ThriftBackend, "_convert_arrow_based_set_to_arrow_table")
     @patch.object(ThriftBackend, "_convert_column_based_set_to_arrow_table")
     def test_create_arrow_table_calls_correct_conversion_method(self, convert_col_mock,
                                                                 convert_arrow_mock):
         thrift_backend = ThriftBackend("foobar", 443, "path", [])
+        convert_arrow_mock.return_value = (MagicMock(), Mock())
+        convert_col_mock.return_value = (MagicMock(), Mock())
 
         schema = Mock()
         cols = Mock()
         arrow_batches = Mock()
 
         t_col_set = ttypes.TRowSet(columns=cols)
-        thrift_backend._create_arrow_table(t_col_set, schema)
+        thrift_backend._create_arrow_table(t_col_set, schema, Mock())
         convert_arrow_mock.assert_not_called()
         convert_col_mock.assert_called_once_with(cols, schema)
 
         t_arrow_set = ttypes.TRowSet(arrowBatches=arrow_batches)
-        thrift_backend._create_arrow_table(t_arrow_set, schema)
+        thrift_backend._create_arrow_table(t_arrow_set, schema, Mock())
         convert_arrow_mock.assert_called_once_with(arrow_batches, schema)
         convert_col_mock.assert_called_once_with(cols, schema)
 
@@ -818,6 +845,60 @@ def test_make_request_will_read_X_Thriftserver_Error_Message_if_set(self, t_tran
 
         self.assertEqual(mock_method.call_count, 13 + 1)
 
+    @staticmethod
+    def make_table_and_desc(height, n_decimal_cols, width, precision, scale, int_constant,
+                            decimal_constant):
+        int_col = [int_constant for _ in range(height)]
+        decimal_col = [decimal_constant for _ in range(height)]
+        data = OrderedDict({"col{}".format(i): int_col for i in range(width - n_decimal_cols)})
+        decimals = OrderedDict({"col_dec{}".format(i): decimal_col for i in range(n_decimal_cols)})
+        data.update(decimals)
+
+        int_desc = ([("", "int")] * (width - n_decimal_cols))
+        decimal_desc = ([("", "decimal", None, None, precision, scale, None)] * n_decimal_cols)
+        description = int_desc + decimal_desc
+
+        table = pyarrow.Table.from_pydict(data)
+        return table, description
+
+    def test_arrow_decimal_conversion(self):
+        # Broader tests in DecimalTestSuite
+        width = 10
+        int_constant = 12345
+        precision, scale = 10, 5
+        decimal_constant = "1.345"
+
+        for n_decimal_cols in [0, 1, 10]:
+            for height in [0, 1, 10]:
+                with self.subTest(n_decimal_cols=n_decimal_cols, height=height):
+                    table, description = self.make_table_and_desc(height, n_decimal_cols, width,
+                                                                  precision, scale, int_constant,
+                                                                  decimal_constant)
+                    decimal_converted_table = ThriftBackend._convert_decimals_in_arrow_table(
+                        table, description)
+
+                    for i in range(width):
+                        if height > 0:
+                            if i < width - n_decimal_cols:
+                                self.assertEqual(
+                                    decimal_converted_table.field(i).type, pyarrow.int64())
+                            else:
+                                self.assertEqual(
+                                    decimal_converted_table.field(i).type,
+                                    pyarrow.decimal128(precision=precision, scale=scale))
+
+                    int_col = [int_constant for _ in range(height)]
+                    decimal_col = [Decimal(decimal_constant) for _ in range(height)]
+                    expected_result = OrderedDict(
+                        {"col{}".format(i): int_col
+                         for i in range(width - n_decimal_cols)})
+                    decimals = OrderedDict(
+                        {"col_dec{}".format(i): decimal_col
+                         for i in range(n_decimal_cols)})
+                    expected_result.update(decimals)
+
+                    self.assertEqual(decimal_converted_table.to_pydict(), expected_result)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/cmdexec/clients/python/tests/tests.py b/cmdexec/clients/python/tests/tests.py
@@ -10,10 +10,10 @@
 from databricks.sql import InterfaceError, DatabaseError, Error
 
 from cmdexec.clients.python.tests.test_fetches import FetchTests
-from cmdexec.clients.python.tests.test_thrift_backend import TestThriftBackend
+from cmdexec.clients.python.tests.test_thrift_backend import ThriftBackendTestSuite
 
 
-class ClientTests(unittest.TestCase):
+class ClientTestSuite(unittest.TestCase):
     """
     Unit tests for isolated client behaviour. See
     qa/test/cmdexec/python/suites/simple_connection_test.py for integration tests that
@@ -332,23 +332,10 @@ def test_max_number_of_retries_passthrough(self, mock_client_class):
         self.assertEqual(mock_client_class.call_args[1]["_max_number_of_retries"], 53)
 
 
-class ResultSetTests(unittest.TestCase):
-    def test_parse_type_converts_decimal(self):
-        for input in [None, 0, "0", 5, "5", 2.33, "2.33"]:
-            with self.subTest(input=input):
-                res = client.ResultSet.parse_type("decimal", input)
-                if input != None:
-                    self.assertEqual(type(res), Decimal)
-                    self.assertEqual(res, Decimal(input))
-                else:
-                    self.assertEqual(type(res), type(None))
-                    self.assertEqual(res, None)
-
-
 if __name__ == '__main__':
     suite = unittest.TestLoader().loadTestsFromModule(sys.modules[__name__])
     loader = unittest.TestLoader()
-    test_classes = [ClientTests, ResultSetTests, FetchTests, TestThriftBackend]
+    test_classes = [ClientTestSuite, FetchTests, ThriftBackendTestSuite]
     suites_list = []
     for test_class in test_classes:
         suite = loader.loadTestsFromTestCase(test_class)

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -5,6 +5,6 @@` |
| `5` | `5` | `version="0.0.0",` |
| `6` | `6` | `package_dir={"": "src"},` |
| `7` | `7` | `packages=setuptools.find_packages(where="src"),` |
| `8` | | `- install_requires=["pyarrow", 'thrift>=0.10.0'],` |
| | `8` | `+ install_requires=["pyarrow", 'thrift>=0.10.0', "pandas"],` |
| `9` | `9` | `author="Databricks",` |
| `10` | `10` | `)` |