Commit 910bb5c
Author: Jesse

SQLAlchemy 2: Stop skipping all type tests (#242)

Signed-off-by: Jesse Whitehouse <jesse.whitehouse@databricks.com>

1 parent efc0337 commit 910bb5c

7 files changed: +376 -560 lines changed

CONTRIBUTING.md

Lines changed: 5 additions & 6 deletions
@@ -148,16 +148,15 @@ The suites marked `[not documented]` require additional configuration which will
 
 SQLAlchemy provides reusable tests for testing dialect implementations.
 
-To run these tests, assuming the environment variables needed for e2e tests are set, do the following:
-
 ```
-cd src/databricks/sqlalchemy
-poetry run python -m pytest test/sqlalchemy_dialect_compliance.py --dburi \
+poetry shell
+cd src/databricks/sqlalchemy/test
+python -m pytest test_suite.py --dburi \
 "databricks://token:$access_token@$host?http_path=$http_path&catalog=$catalog&schema=$schema"
 ```
 
-Some of these of these tests fail currently. We're working on getting
-relavent tests passing and others skipped.
+Some of these tests fail currently. We're working on getting relevant tests passing and others skipped. The tests that we've already reviewed and verified
+are decorated with a pytest marker called `reviewed`. To only run these tests and check for regressions, you can add `-m reviewed` to the invocation command above.
 
 ### Code formatting
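For example, a reviewed-only regression run as described above would look like this (a sketch, assuming the same environment variables are set):

```
python -m pytest test_suite.py -m reviewed --dburi \
  "databricks://token:$access_token@$host?http_path=$http_path&catalog=$catalog&schema=$schema"
```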

src/databricks/sqlalchemy/__init__.py

Lines changed: 7 additions & 2 deletions
@@ -13,7 +13,7 @@
 from databricks import sql
 
 # This import is required to process our @compiles decorators
-import databricks.sqlalchemy.types
+import databricks.sqlalchemy._types as dialect_type_impl
 
 
 from databricks.sqlalchemy.base import (
@@ -48,6 +48,12 @@ class DatabricksDialect(default.DefaultDialect):
     non_native_boolean_check_constraint: bool = False
     paramstyle: str = "named"
 
+    colspecs = {
+        sqlalchemy.types.DateTime: dialect_type_impl.DatabricksDateTimeNoTimezoneType,
+        sqlalchemy.types.Time: dialect_type_impl.DatabricksTimeType,
+        sqlalchemy.types.String: dialect_type_impl.DatabricksStringType,
+    }
+
     @classmethod
     def dbapi(cls):
         return sql
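A hedged sketch of what this mapping does (not from the commit itself): `DefaultDialect.type_descriptor()` resolves generic types through `colspecs` via `sqlalchemy.types.adapt_type()`, so `DateTime`, `Time`, and `String` columns are bound and fetched through the dialect-level `TypeDecorator` classes. Assuming the package is installed:

```python
import sqlalchemy

import databricks.sqlalchemy._types as dialect_type_impl
from databricks.sqlalchemy import DatabricksDialect

dialect = DatabricksDialect()
# type_descriptor() consults colspecs, swapping the generic DateTime
# for the dialect's timezone-stripping implementation.
adapted = dialect.type_descriptor(sqlalchemy.types.DateTime())
assert isinstance(adapted, dialect_type_impl.DatabricksDateTimeNoTimezoneType)
```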
@@ -130,7 +136,6 @@ def get_columns(self, connection, table_name, schema=None, **kwargs):
         columns = []
 
         for col in resp:
-
             # Taken from PyHive. This removes added type info from decimals and maps
             _col_type = re.search(r"^\w+", col.TYPE_NAME).group(0)
             this_column = {
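For clarity, a small sketch of what that regex does to parameterized type names (the example values are illustrative):

```python
import re

# "DECIMAL(10,2)" carries precision/scale info that the column map doesn't need
assert re.search(r"^\w+", "DECIMAL(10,2)").group(0) == "DECIMAL"
assert re.search(r"^\w+", "STRING").group(0) == "STRING"
```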
src/databricks/sqlalchemy/_types.py

Lines changed: 214 additions & 0 deletions
@@ -0,0 +1,214 @@
+import sqlalchemy
+from sqlalchemy.ext.compiler import compiles
+
+from typing import Union
+
+from datetime import datetime, time
+
+
+from databricks.sql.utils import ParamEscaper
+
+
+@compiles(sqlalchemy.types.Enum, "databricks")
+@compiles(sqlalchemy.types.String, "databricks")
+@compiles(sqlalchemy.types.Text, "databricks")
+@compiles(sqlalchemy.types.Time, "databricks")
+@compiles(sqlalchemy.types.Unicode, "databricks")
+@compiles(sqlalchemy.types.UnicodeText, "databricks")
+@compiles(sqlalchemy.types.Uuid, "databricks")
+def compile_string_databricks(type_, compiler, **kw):
+    """
+    We override the default compilation for Enum(), String(), Text(), and Time() because SQLAlchemy
+    defaults to incompatible / abnormal compiled names
+
+      Enum -> VARCHAR
+      String -> VARCHAR[LENGTH]
+      Text -> VARCHAR[LENGTH]
+      Time -> TIME
+      Unicode -> VARCHAR[LENGTH]
+      UnicodeText -> TEXT
+      Uuid -> CHAR[32]
+
+    But all of these types will be compiled to STRING in Databricks SQL
+    """
+    return "STRING"
+
+
+@compiles(sqlalchemy.types.Integer, "databricks")
+def compile_integer_databricks(type_, compiler, **kw):
+    """
+    We need to override the default Integer compilation rendering because Databricks uses "INT" instead of "INTEGER"
+    """
+    return "INT"
+
+
+@compiles(sqlalchemy.types.LargeBinary, "databricks")
+def compile_binary_databricks(type_, compiler, **kw):
+    """
+    We need to override the default LargeBinary compilation rendering because Databricks uses "BINARY" instead of "BLOB"
+    """
+    return "BINARY"
+
+
+@compiles(sqlalchemy.types.Numeric, "databricks")
+def compile_numeric_databricks(type_, compiler, **kw):
+    """
+    We need to override the default Numeric compilation rendering because Databricks uses "DECIMAL" instead of "NUMERIC"
+
+    The built-in visit_DECIMAL behaviour captures the precision and scale. Here we're just mapping calls to compile Numeric
+    to the SQLAlchemy Decimal() implementation
+    """
+    return compiler.visit_DECIMAL(type_, **kw)
+
+
+@compiles(sqlalchemy.types.DateTime, "databricks")
+def compile_datetime_databricks(type_, compiler, **kw):
+    """
+    We need to override the default DateTime compilation rendering because Databricks uses "TIMESTAMP" instead of "DATETIME"
+    """
+    return "TIMESTAMP_NTZ"
+
+
+@compiles(sqlalchemy.types.ARRAY, "databricks")
+def compile_array_databricks(type_, compiler, **kw):
+    """
+    SQLAlchemy's default ARRAY can't compile as it's only implemented for Postgresql.
+    The Postgres implementation works for Databricks SQL, so we duplicate that here.
+
+    :type_:
+        This is an instance of sqlalchemy.types.ARRAY which always includes an item_type attribute
+        which is itself an instance of TypeEngine
+
+    https://docs.sqlalchemy.org/en/20/core/type_basics.html#sqlalchemy.types.ARRAY
+    """
+
+    inner = compiler.process(type_.item_type, **kw)
+
+    return f"ARRAY<{inner}>"
+
+
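To make the effect of the `@compiles` overrides above concrete, here is a minimal sketch (not part of the commit) that renders DDL against the dialect; it assumes the package is installed and the dialect class is importable as shown:

```python
from sqlalchemy import Column, Integer, MetaData, String, Table
from sqlalchemy.schema import CreateTable

from databricks.sqlalchemy import DatabricksDialect

# String(55) and Integer should render as STRING and INT rather than
# the generic VARCHAR(55) and INTEGER.
table = Table("t", MetaData(), Column("name", String(55)), Column("n", Integer))
print(CreateTable(table).compile(dialect=DatabricksDialect()))
# Expected to render roughly: CREATE TABLE t (name STRING, n INT)
```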
+class DatabricksDateTimeNoTimezoneType(sqlalchemy.types.TypeDecorator):
+    """The datetime that pysql creates when it receives the contents of a TIMESTAMP_NTZ
+    includes a timezone of 'Etc/UTC'. But since SQLAlchemy's test suite assumes that
+    the sqlalchemy.types.DateTime type will return a datetime.datetime _without_ any
+    timezone set, we need to strip the timezone off the value received from pysql.
+
+    It's not clear if DBR sends a timezone to pysql or if pysql is adding it. This could be a bug.
+    """
+
+    impl = sqlalchemy.types.DateTime
+
+    cache_ok = True
+
+    def process_result_value(self, value: Union[None, datetime], dialect):
+        if value is None:
+            return None
+        return value.replace(tzinfo=None)
+
+
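A quick hedged illustration of the timezone-stripping behaviour described in the docstring (the `dialect` argument is unused by this method, so `None` suffices in isolation):

```python
from datetime import datetime, timezone

from databricks.sqlalchemy._types import DatabricksDateTimeNoTimezoneType

tz_aware = datetime(2023, 1, 1, 12, 0, tzinfo=timezone.utc)
naive = DatabricksDateTimeNoTimezoneType().process_result_value(tz_aware, dialect=None)
# The wall-clock value is preserved; only tzinfo is dropped.
assert naive == datetime(2023, 1, 1, 12, 0) and naive.tzinfo is None
```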
+class DatabricksTimeType(sqlalchemy.types.TypeDecorator):
+    """Databricks has no native TIME type. So we store it as a string."""
+
+    impl = sqlalchemy.types.Time
+    cache_ok = True
+
+    TIME_WITH_MICROSECONDS_FMT = "%H:%M:%S.%f"
+    TIME_NO_MICROSECONDS_FMT = "%H:%M:%S"
+
+    def process_bind_param(self, value: Union[time, None], dialect) -> Union[None, str]:
+        """Values sent to the database are converted to %H:%M:%S.%f strings."""
+        if value is None:
+            return None
+        return value.strftime(self.TIME_WITH_MICROSECONDS_FMT)
+
+    # mypy doesn't like this workaround because TypeEngine wants process_literal_param to return a string
+    def process_literal_param(self, value, dialect) -> time:  # type: ignore
+        """It's not clear to me why this is necessary. Without it, SQLAlchemy's TimeTest::test_literal fails
+        because the string literal renderer receives a str() object and calls .isoformat() on it.
+
+        Whereas this method receives a datetime.time() object which is subsequently passed to that
+        same renderer. And that works.
+
+        UPDATE: After coping with the literal_processor override in DatabricksStringType, I suspect a similar
+        mechanism is at play. Two different processors are called in sequence. This is likely a byproduct
+        of Databricks not having a true TIME type. I think the string representation of Time() types is
+        somehow affecting the literal rendering process. But as long as this passes the tests, I'm not
+        worried about it.
+        """
+        return value
+
+    def process_result_value(
+        self, value: Union[None, str], dialect
+    ) -> Union[time, None]:
+        """Values received from the database are parsed into datetime.time() objects"""
+        if value is None:
+            return None
+
+        try:
+            _parsed = datetime.strptime(value, self.TIME_WITH_MICROSECONDS_FMT)
+        except ValueError:
+            # If the string doesn't have microseconds, try parsing it without them
+            _parsed = datetime.strptime(value, self.TIME_NO_MICROSECONDS_FMT)
+
+        return _parsed.time()
+
+
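A minimal round-trip sketch of the string-backed TIME handling above (again passing `dialect=None` since the methods don't use it):

```python
from datetime import time

from databricks.sqlalchemy._types import DatabricksTimeType

tt = DatabricksTimeType()
# Bound values become %H:%M:%S.%f strings...
assert tt.process_bind_param(time(12, 34, 56, 789000), dialect=None) == "12:34:56.789000"
# ...and result strings are parsed back, with or without microseconds.
assert tt.process_result_value("12:34:56.789000", dialect=None) == time(12, 34, 56, 789000)
assert tt.process_result_value("12:34:56", dialect=None) == time(12, 34, 56)
```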
+class DatabricksStringType(sqlalchemy.types.TypeDecorator):
+    """We have to implement our own String() type because SQLAlchemy's default implementation
+    wants to escape single-quotes with a doubled single-quote. Databricks uses a backslash for
+    escaping of literal strings. And SQLAlchemy's default escaping breaks Databricks SQL.
+    """
+
+    impl = sqlalchemy.types.String
+    cache_ok = True
+    pe = ParamEscaper()
+
+    def process_literal_param(self, value, dialect) -> str:
+        """SQLAlchemy's default string escaping for backslashes doesn't work for Databricks. The logic here
+        implements the same behaviour as our legacy inline escaping logic.
+        """
+
+        return self.pe.escape_string(value)
+
+    def literal_processor(self, dialect):
+        """We manually override this method to prevent further processing of the string literal beyond
+        what happens in the process_literal_param() method.
+
+        The SQLAlchemy docs _specifically_ say to not override this method.
+
+        It appears that any processing that happens from TypeEngine.process_literal_param happens _before_
+        and _in addition to_ whatever the class's impl.literal_processor() method does. The String.literal_processor()
+        method performs a string replacement that doubles any single-quote in the contained string. This raises a syntax
+        error in Databricks. And it's not necessary because ParamEscaper() already implements all the escaping we need.
+
+        We should consider opening an issue on the SQLAlchemy project to see if we're using it wrong.
+
+        See type_api.py::TypeEngine.literal_processor:
+
+        ```python
+        def process(value: Any) -> str:
+            return fixed_impl_processor(
+                fixed_process_literal_param(value, dialect)
+            )
+        ```
+
+        That call to fixed_impl_processor wraps the result of fixed_process_literal_param (which is the
+        process_literal_param defined in our Databricks dialect)
+
+        https://docs.sqlalchemy.org/en/20/core/custom_types.html#sqlalchemy.types.TypeDecorator.literal_processor
+        """
+
+        def process(value):
+            """This is a copy of the default String.literal_processor() method but stripping away
+            its double-escaping behaviour for single-quotes.
+            """
+
+            _step1 = self.process_literal_param(value, dialect="databricks")
+            if dialect.identifier_preparer._double_percents:
+                _step2 = _step1.replace("%", "%%")
+            else:
+                _step2 = _step1
+
+            return "%s" % _step2
+
+        return process
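And a hedged sketch of the escaping difference this class exists to handle. The exact output of `ParamEscaper.escape_string()` is an assumption here; the point is backslash escaping rather than SQLAlchemy's default doubled single-quote:

```python
from databricks.sqlalchemy._types import DatabricksStringType

st = DatabricksStringType()
literal = st.process_literal_param("O'Reilly", dialect=None)
# Expected (assumption): a quoted literal like 'O\'Reilly',
# where the default String type would produce 'O''Reilly'.
print(literal)
```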
src/databricks/sqlalchemy/requirements.py

Lines changed: 83 additions & 21 deletions
@@ -1,34 +1,96 @@
 """
-This module is supposedly used by the compliance tests to control which tests are run based on database capabilities.
-However, based on some experimentation that does not appear to be consistently the case. Until we better understand
-when these requirements are and are not implemented, we prefer to manually capture the exact nature of the failures
-and errors.
-
-Once we better understand how to use requirements.py, an example exclusion will look like this:
-
-import sqlalchemy.testing.requirements
-import sqlalchemy.testing.exclusions
-
-class Requirements(sqlalchemy.testing.requirements.SuiteRequirements):
-    @property
-    def __some_example_requirement(self):
-        return sqlalchemy.testing.exclusions.closed
-
-
 The complete list of requirements is provided by SQLAlchemy here:
 
 https://github.com/sqlalchemy/sqlalchemy/blob/main/lib/sqlalchemy/testing/requirements.py
+
+When SQLAlchemy skips a test because a requirement is closed() it gives a generic skip message.
+To make these failures more actionable, we only define requirements in this file that we wish to
+force to be open(). If a test should be skipped on Databricks, it will be specifically marked skip
+in test_suite.py with a Databricks-specific reason.
+
+See the special note about the array_type exclusion below.
 """
 
 import sqlalchemy.testing.requirements
 import sqlalchemy.testing.exclusions
 
-import logging
 
-logger = logging.getLogger(__name__)
+class Requirements(sqlalchemy.testing.requirements.SuiteRequirements):
+    @property
+    def date_historic(self):
+        """target dialect supports representation of Python
+        datetime.date() objects with historic (pre 1970) values."""
 
-logger.warning("requirements.py is not currently employed by Databricks dialect")
+        return sqlalchemy.testing.exclusions.open()
 
+    @property
+    def datetime_historic(self):
+        """target dialect supports representation of Python
+        datetime.datetime() objects with historic (pre 1970) values."""
 
-class Requirements(sqlalchemy.testing.requirements.SuiteRequirements):
-    pass
+        return sqlalchemy.testing.exclusions.open()
+
+    @property
+    def datetime_literals(self):
+        """target dialect supports rendering of a date, time, or datetime as a
+        literal string, e.g. via the TypeEngine.literal_processor() method.
+
+        """
+
+        return sqlalchemy.testing.exclusions.open()
+
+    @property
+    def timestamp_microseconds(self):
+        """target dialect supports representation of Python
+        datetime.datetime() with microsecond objects but only
+        if TIMESTAMP is used."""
+
+        return sqlalchemy.testing.exclusions.open()
+
+    @property
+    def time_microseconds(self):
+        """target dialect supports representation of Python
+        datetime.time() with microsecond objects.
+
+        This requirement declaration isn't needed but I've included it here for completeness.
+        Since Databricks doesn't have a TIME type, SQLAlchemy will compile Time() columns
+        as STRING Databricks data types. And we use a custom time type to render those strings
+        between str() and datetime.time() representations. Therefore we can store _any_ precision
+        that SQLAlchemy needs. The time_microseconds requirement defaults to ON for all dialects
+        except mssql, mysql, mariadb, and oracle.
+        """
+
+        return sqlalchemy.testing.exclusions.open()
+
+    @property
+    def infinity_floats(self):
+        """The Float type can persist and load float('inf'), float('-inf')."""
+
+        return sqlalchemy.testing.exclusions.open()
+
+    @property
+    def precision_numerics_retains_significant_digits(self):
+        """A precision numeric type will return empty significant digits,
+        i.e. a value such as 10.000 will come back in Decimal form with
+        the .000 maintained."""
+
+        return sqlalchemy.testing.exclusions.open()
+
+    @property
+    def precision_numerics_many_significant_digits(self):
+        """target backend supports values with many digits on both sides,
+        such as 319438950232418390.273596, 87673.594069654243
+
+        """
+        return sqlalchemy.testing.exclusions.open()
+
+    @property
+    def array_type(self):
+        """While Databricks does support ARRAY types, pysql cannot bind them. So
+        we cannot use them with SQLAlchemy.
+
+        Due to a bug in SQLAlchemy, we _must_ define this exclusion as closed() here or else the
+        test runner will crash the pytest process due to an AttributeError
+        """
+
+        return sqlalchemy.testing.exclusions.closed()
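The "specifically marked skip in test_suite.py" pattern that this docstring describes would look roughly like the sketch below; the suite class and the reason text are illustrative assumptions, not copied from test_suite.py:

```python
import pytest

# SQLAlchemy dialect test suites conventionally re-declare an inherited
# suite class in order to customize or skip it.
from sqlalchemy.testing.suite import TimeTest as _TimeTest


@pytest.mark.skip(reason="Databricks-specific reason for the failure")
class TimeTest(_TimeTest):
    pass
```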
