Skip to content

Commit caa5028

Browse files
NiallEgan authored and susodapop committed
Improve deserialisation in client
Improved the deserialisation performance in the client. We've gone from more than 10x worse performance to something like this: (the benchmark figure is not reproduced in this extract).
I think it would also be nice to add some flags:
- `nulls_as_nans` to skip the conversion from NaNs to Nones in the dataframe
- `pandas_datetimes` to leave the datetimes as the native Pandas datetime dtype
- `decimals_as_floats` to leave decimal columns as floats instead of converting them to `Decimal` in the dataframe
Tested with the existing integration tests.
1 parent 236f88c commit caa5028

File tree

2 files changed

+18
-25
lines changed

2 files changed

+18
-25
lines changed

cmdexec/clients/python/src/databricks/sql/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ def __eq__(self, other):
99
return other in self.values
1010

1111
def __repr__(self):
12-
return "DBAPITypeObject(%s)" % self.values
12+
return "DBAPITypeObject({values})".format(values=self.values)
1313

1414

1515
STRING = _DBAPITypeObject('string')

cmdexec/clients/python/src/databricks/sql/client.py

Lines changed: 17 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from typing import Dict, Tuple, List, Optional, Any
1111

1212
import grpc
13+
import pandas
1314
import pyarrow
1415

1516
from databricks.sql.errors import OperationalError, InterfaceError, DatabaseError, Error, DataError
@@ -26,8 +27,9 @@
2627

2728

2829
def _parse_timestamp(value):
29-
if type(value) is datetime.datetime:
30-
# The cmd exec server will return a datetime.datetime, so no further parsing is needed
30+
if type(value) is datetime.datetime or type(value) is pandas.Timestamp:
31+
# The cmd exec server will return a native datetime / timestamp, so no further parsing is
32+
# needed
3133
return value
3234
elif value:
3335
match = _TIMESTAMP_PATTERN.match(value)
@@ -38,20 +40,13 @@ def _parse_timestamp(value):
3840
value = match.group()
3941
else:
4042
format = '%Y-%m-%d %H:%M:%S'
41-
value = datetime.datetime.strptime(value, format)
42-
return value
43+
return pandas.to_datetime(datetime.datetime.strptime(value, format))
4344
else:
4445
raise Exception('Cannot convert "{}" into a datetime'.format(value))
4546
else:
4647
return None
4748

4849

49-
TYPES_CONVERTER = {
50-
"decimal": Decimal,
51-
"timestamp": _parse_timestamp,
52-
}
53-
54-
5550
class Connection:
5651
def __init__(self,
5752
server_hostname: str,
@@ -579,21 +574,19 @@ def _fill_results_buffer(self):
579574
self.has_more_rows = has_more_rows
580575
self.description = description
581576

582-
@staticmethod
583-
def parse_type(type_, value):
584-
converter = TYPES_CONVERTER.get(type_)
585-
if converter:
586-
return converter(value)
587-
else:
588-
return value
589-
590577
def _convert_arrow_table(self, table):
591-
n_rows, _ = table.shape
592-
list_repr = [[
593-
self.parse_type(self.description[col_index][1], col[row_index].as_py())
594-
for col_index, col in enumerate(table.itercolumns())
595-
] for row_index in range(n_rows)]
596-
return list_repr
578+
df = table.to_pandas()
579+
for (i, col) in enumerate(df.columns):
580+
# Check for 0 because .dt doesn't work on empty series
581+
if self.description[i][1] == 'timestamp' and len(df) > 0:
582+
# We store the dtype as object so we don't use the pandas datetime dtype but
583+
# a native datetime.datetime
584+
timestamp_col = df[col].apply(_parse_timestamp)
585+
df[col] = pandas.Series(timestamp_col.dt.to_pydatetime(), dtype='object')
586+
elif self.description[i][1] == 'decimal':
587+
df[col] = pandas.Series(df[col].apply(Decimal), dtype='object')
588+
# Replace NaNs with None to maintain backwards compatibility
589+
return df.where(pandas.notnull(df), None).values.tolist()
597590

598591
def fetchmany_arrow(self, n_rows: int) -> pyarrow.Table:
599592
"""

0 commit comments

Comments
 (0)