Improve deserialisation in client

NiallEgan · susodapop · commit caa5028cacbc · 2022-06-02T11:14:15.000-05:00
Improved the deserialisation performance in the client. We've gone from > 10x worse performance to something like this:

I think it would also be nice to add some flags:
- `nulls_as_nans` to skip the conversion from NaNs to Nones in the dataframe
- `pandas_datetimes` to leave the datetimes as native Pandas datetime dtype
- `decimals_as_floats` to not convert the decimals to floats in the dataframe

Existing integration tests
diff --git a/cmdexec/clients/python/src/databricks/sql/__init__.py b/cmdexec/clients/python/src/databricks/sql/__init__.py
@@ -9,7 +9,7 @@ def __eq__(self, other):
         return other in self.values
 
     def __repr__(self):
-        return "DBAPITypeObject(%s)" % self.values
+        return "DBAPITypeObject({values})".format(values=self.values)
 
 
 STRING = _DBAPITypeObject('string')
diff --git a/cmdexec/clients/python/src/databricks/sql/client.py b/cmdexec/clients/python/src/databricks/sql/client.py
@@ -10,6 +10,7 @@
 from typing import Dict, Tuple, List, Optional, Any
 
 import grpc
+import pandas
 import pyarrow
 
 from databricks.sql.errors import OperationalError, InterfaceError, DatabaseError, Error, DataError
@@ -26,8 +27,9 @@
 
 
 def _parse_timestamp(value):
-    if type(value) is datetime.datetime:
-        # The cmd exec server will return a datetime.datetime, so no further parsing is needed
+    if type(value) is datetime.datetime or type(value) is pandas.Timestamp:
+        # The cmd exec server will return a native datetime / timestamp, so no further parsing is
+        # needed
         return value
     elif value:
         match = _TIMESTAMP_PATTERN.match(value)
@@ -38,20 +40,13 @@ def _parse_timestamp(value):
                 value = match.group()
             else:
                 format = '%Y-%m-%d %H:%M:%S'
-            value = datetime.datetime.strptime(value, format)
-            return value
+            return pandas.to_datetime(datetime.datetime.strptime(value, format))
         else:
             raise Exception('Cannot convert "{}" into a datetime'.format(value))
     else:
         return None
 
 
-TYPES_CONVERTER = {
-    "decimal": Decimal,
-    "timestamp": _parse_timestamp,
-}
-
-
 class Connection:
     def __init__(self,
                  server_hostname: str,
@@ -579,21 +574,19 @@ def _fill_results_buffer(self):
             self.has_more_rows = has_more_rows
             self.description = description
 
-    @staticmethod
-    def parse_type(type_, value):
-        converter = TYPES_CONVERTER.get(type_)
-        if converter:
-            return converter(value)
-        else:
-            return value
-
     def _convert_arrow_table(self, table):
-        n_rows, _ = table.shape
-        list_repr = [[
-            self.parse_type(self.description[col_index][1], col[row_index].as_py())
-            for col_index, col in enumerate(table.itercolumns())
-        ] for row_index in range(n_rows)]
-        return list_repr
+        df = table.to_pandas()
+        for (i, col) in enumerate(df.columns):
+            # Check for 0 because .dt doesn't work on empty series
+            if self.description[i][1] == 'timestamp' and len(df) > 0:
+                # We store the dtype as object so we don't use the pandas datetime dtype but
+                # a native datetime.datetime
+                timestamp_col = df[col].apply(_parse_timestamp)
+                df[col] = pandas.Series(timestamp_col.dt.to_pydatetime(), dtype='object')
+            elif self.description[i][1] == 'decimal':
+                df[col] = pandas.Series(df[col].apply(Decimal), dtype='object')
+        # Replace NaNs with None to maintain backwards compatibility
+        return df.where(pandas.notnull(df), None).values.tolist()
 
     def fetchmany_arrow(self, n_rows: int) -> pyarrow.Table:
         """