Commit 910bb5c
Author: Jesse

SQLAlchemy 2: Stop skipping all type tests (#242)

Signed-off-by: Jesse Whitehouse <jesse.whitehouse@databricks.com>

1 parent efc0337 commit 910bb5c

7 files changed: +376 -560 lines changed

CONTRIBUTING.md

Lines changed: 5 additions & 6 deletions
@@ -148,16 +148,15 @@ The suites marked `[not documented]` require additional configuration which will
 
 SQLAlchemy provides reusable tests for testing dialect implementations.
 
-To run these tests, assuming the environment variables needed for e2e tests are set, do the following:
-
 ```
-cd src/databricks/sqlalchemy
-poetry run python -m pytest test/sqlalchemy_dialect_compliance.py --dburi \
+poetry shell
+cd src/databricks/sqlalchemy/test
+python -m pytest test_suite.py --dburi \
 "databricks://token:$access_token@$host?http_path=$http_path&catalog=$catalog&schema=$schema"
 ```
 
-Some of these of these tests fail currently. We're working on getting
-relavent tests passing and others skipped.
+Some of these tests fail currently. We're working on getting relevant tests passing and others skipped. The tests that we've already reviewed and verified
+are decorated with a pytest marker called `reviewed`. To only run these tests and check for regressions, you can add `-m reviewed` to the invocation command above.
 
 ### Code formatting
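For example, a reviewed-only regression run as described above would look like this (a sketch, assuming the same environment variables are set):

```
python -m pytest test_suite.py -m reviewed --dburi \
  "databricks://token:$access_token@$host?http_path=$http_path&catalog=$catalog&schema=$schema"
```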

src/databricks/sqlalchemy/__init__.py

Lines changed: 7 additions & 2 deletions
@@ -13,7 +13,7 @@
 from databricks import sql
 
 # This import is required to process our @compiles decorators
-import databricks.sqlalchemy.types
+import databricks.sqlalchemy._types as dialect_type_impl
 
 
 from databricks.sqlalchemy.base import (
@@ -48,6 +48,12 @@ class DatabricksDialect(default.DefaultDialect):
     non_native_boolean_check_constraint: bool = False
     paramstyle: str = "named"
 
+    colspecs = {
+        sqlalchemy.types.DateTime: dialect_type_impl.DatabricksDateTimeNoTimezoneType,
+        sqlalchemy.types.Time: dialect_type_impl.DatabricksTimeType,
+        sqlalchemy.types.String: dialect_type_impl.DatabricksStringType,
+    }
+
     @classmethod
     def dbapi(cls):
         return sql
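A hedged sketch of what this mapping does (not from the commit itself): `DefaultDialect.type_descriptor()` resolves generic types through `colspecs` via `sqlalchemy.types.adapt_type()`, so `DateTime`, `Time`, and `String` columns are bound and fetched through the dialect-level `TypeDecorator` classes. Assuming the package is installed:

```python
import sqlalchemy

import databricks.sqlalchemy._types as dialect_type_impl
from databricks.sqlalchemy import DatabricksDialect

dialect = DatabricksDialect()
# type_descriptor() consults colspecs, swapping the generic DateTime
# for the dialect's timezone-stripping implementation.
adapted = dialect.type_descriptor(sqlalchemy.types.DateTime())
assert isinstance(adapted, dialect_type_impl.DatabricksDateTimeNoTimezoneType)
```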
@@ -130,7 +136,6 @@ def get_columns(self, connection, table_name, schema=None, **kwargs):
         columns = []
 
         for col in resp:
-
             # Taken from PyHive. This removes added type info from decimals and maps
             _col_type = re.search(r"^\w+", col.TYPE_NAME).group(0)
             this_column = {
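For clarity, a small sketch of what that regex does to parameterized type names (the example values are illustrative):

```python
import re

# "DECIMAL(10,2)" carries precision/scale info that the column map doesn't need
assert re.search(r"^\w+", "DECIMAL(10,2)").group(0) == "DECIMAL"
assert re.search(r"^\w+", "STRING").group(0) == "STRING"
```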
src/databricks/sqlalchemy/_types.py

Lines changed: 214 additions & 0 deletions
@@ -0,0 +1,214 @@
+import sqlalchemy
+from sqlalchemy.ext.compiler import compiles
+
+from typing import Union
+
+from datetime import datetime, time
+
+
+from databricks.sql.utils import ParamEscaper
+
+
+@compiles(sqlalchemy.types.Enum, "databricks")
+@compiles(sqlalchemy.types.String, "databricks")
+@compiles(sqlalchemy.types.Text, "databricks")
+@compiles(sqlalchemy.types.Time, "databricks")
+@compiles(sqlalchemy.types.Unicode, "databricks")
+@compiles(sqlalchemy.types.UnicodeText, "databricks")
+@compiles(sqlalchemy.types.Uuid, "databricks")
+def compile_string_databricks(type_, compiler, **kw):
+    """
+    We override the default compilation for Enum(), String(), Text(), and Time() because SQLAlchemy
+    defaults to incompatible / abnormal compiled names
+
+      Enum -> VARCHAR
+      String -> VARCHAR[LENGTH]
+      Text -> VARCHAR[LENGTH]
+      Time -> TIME
+      Unicode -> VARCHAR[LENGTH]
+      UnicodeText -> TEXT
+      Uuid -> CHAR[32]
+
+    But all of these types will be compiled to STRING in Databricks SQL
+    """
+    return "STRING"
+
+
+@compiles(sqlalchemy.types.Integer, "databricks")
+def compile_integer_databricks(type_, compiler, **kw):
+    """
+    We need to override the default Integer compilation rendering because Databricks uses "INT" instead of "INTEGER"
+    """
+    return "INT"
+
+
+@compiles(sqlalchemy.types.LargeBinary, "databricks")
+def compile_binary_databricks(type_, compiler, **kw):
+    """
+    We need to override the default LargeBinary compilation rendering because Databricks uses "BINARY" instead of "BLOB"
+    """
+    return "BINARY"
+
+
+@compiles(sqlalchemy.types.Numeric, "databricks")
+def compile_numeric_databricks(type_, compiler, **kw):
+    """
+    We need to override the default Numeric compilation rendering because Databricks uses "DECIMAL" instead of "NUMERIC"
+
+    The built-in visit_DECIMAL behaviour captures the precision and scale. Here we're just mapping calls to compile Numeric
+    to the SQLAlchemy Decimal() implementation
+    """
+    return compiler.visit_DECIMAL(type_, **kw)
+
+
+@compiles(sqlalchemy.types.DateTime, "databricks")
+def compile_datetime_databricks(type_, compiler, **kw):
+    """
+    We need to override the default DateTime compilation rendering because Databricks uses "TIMESTAMP" instead of "DATETIME"
+    """
+    return "TIMESTAMP_NTZ"
+
+
+@compiles(sqlalchemy.types.ARRAY, "databricks")
+def compile_array_databricks(type_, compiler, **kw):
+    """
+    SQLAlchemy's default ARRAY can't compile as it's only implemented for Postgresql.
+    The Postgres implementation works for Databricks SQL, so we duplicate that here.
+
+    :type_:
+        This is an instance of sqlalchemy.types.ARRAY which always includes an item_type attribute
+        which is itself an instance of TypeEngine
+
+    https://docs.sqlalchemy.org/en/20/core/type_basics.html#sqlalchemy.types.ARRAY
+    """
+
+    inner = compiler.process(type_.item_type, **kw)
+
+    return f"ARRAY<{inner}>"
+
+
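To make the effect of the `@compiles` overrides above concrete, here is a minimal sketch (not part of the commit) that renders DDL against the dialect; it assumes the package is installed and the dialect class is importable as shown:

```python
from sqlalchemy import Column, Integer, MetaData, String, Table
from sqlalchemy.schema import CreateTable

from databricks.sqlalchemy import DatabricksDialect

# String(55) and Integer should render as STRING and INT rather than
# the generic VARCHAR(55) and INTEGER.
table = Table("t", MetaData(), Column("name", String(55)), Column("n", Integer))
print(CreateTable(table).compile(dialect=DatabricksDialect()))
# Expected to render roughly: CREATE TABLE t (name STRING, n INT)
```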
+class DatabricksDateTimeNoTimezoneType(sqlalchemy.types.TypeDecorator):
+    """The datetime that pysql creates when it receives the contents of a TIMESTAMP_NTZ
+    includes a timezone of 'Etc/UTC'. But since SQLAlchemy's test suite assumes that
+    the sqlalchemy.types.DateTime type will return a datetime.datetime _without_ any
+    timezone set, we need to strip the timezone off the value received from pysql.
+
+    It's not clear if DBR sends a timezone to pysql or if pysql is adding it. This could be a bug.
+    """
+
+    impl = sqlalchemy.types.DateTime
+
+    cache_ok = True
+
+    def process_result_value(self, value: Union[None, datetime], dialect):
+        if value is None:
+            return None
+        return value.replace(tzinfo=None)
+
+
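A quick hedged illustration of the timezone-stripping behaviour described in the docstring (the `dialect` argument is unused by this method, so `None` suffices in isolation):

```python
from datetime import datetime, timezone

from databricks.sqlalchemy._types import DatabricksDateTimeNoTimezoneType

tz_aware = datetime(2023, 1, 1, 12, 0, tzinfo=timezone.utc)
naive = DatabricksDateTimeNoTimezoneType().process_result_value(tz_aware, dialect=None)
# The wall-clock value is preserved; only tzinfo is dropped.
assert naive == datetime(2023, 1, 1, 12, 0) and naive.tzinfo is None
```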
+class DatabricksTimeType(sqlalchemy.types.TypeDecorator):
+    """Databricks has no native TIME type. So we store it as a string."""
+
+    impl = sqlalchemy.types.Time
+    cache_ok = True
+
+    TIME_WITH_MICROSECONDS_FMT = "%H:%M:%S.%f"
+    TIME_NO_MICROSECONDS_FMT = "%H:%M:%S"
+
+    def process_bind_param(self, value: Union[time, None], dialect) -> Union[None, str]:
+        """Values sent to the database are converted to %H:%M:%S.%f strings."""
+        if value is None:
+            return None
+        return value.strftime(self.TIME_WITH_MICROSECONDS_FMT)
+
+    # mypy doesn't like this workaround because TypeEngine wants process_literal_param to return a string
+    def process_literal_param(self, value, dialect) -> time:  # type: ignore
+        """It's not clear to me why this is necessary. Without it, SQLAlchemy's TimeTest::test_literal fails
+        because the string literal renderer receives a str() object and calls .isoformat() on it.
+
+        Whereas this method receives a datetime.time() object which is subsequently passed to that
+        same renderer. And that works.
+
+        UPDATE: After coping with the literal_processor override in DatabricksStringType, I suspect a similar
+        mechanism is at play. Two different processors are called in sequence. This is likely a byproduct
+        of Databricks not having a true TIME type. I think the string representation of Time() types is
+        somehow affecting the literal rendering process. But as long as this passes the tests, I'm not
+        worried about it.
+        """
+        return value
+
+    def process_result_value(
+        self, value: Union[None, str], dialect
+    ) -> Union[time, None]:
+        """Values received from the database are parsed into datetime.time() objects"""
+        if value is None:
+            return None
+
+        try:
+            _parsed = datetime.strptime(value, self.TIME_WITH_MICROSECONDS_FMT)
+        except ValueError:
+            # If the string doesn't have microseconds, try parsing it without them
+            _parsed = datetime.strptime(value, self.TIME_NO_MICROSECONDS_FMT)
+
+        return _parsed.time()
+
+
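A minimal round-trip sketch of the string-backed TIME handling above (again passing `dialect=None` since the methods don't use it):

```python
from datetime import time

from databricks.sqlalchemy._types import DatabricksTimeType

tt = DatabricksTimeType()
# Bound values become %H:%M:%S.%f strings...
assert tt.process_bind_param(time(12, 34, 56, 789000), dialect=None) == "12:34:56.789000"
# ...and result strings are parsed back, with or without microseconds.
assert tt.process_result_value("12:34:56.789000", dialect=None) == time(12, 34, 56, 789000)
assert tt.process_result_value("12:34:56", dialect=None) == time(12, 34, 56)
```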
+class DatabricksStringType(sqlalchemy.types.TypeDecorator):
+    """We have to implement our own String() type because SQLAlchemy's default implementation
+    wants to escape single-quotes with a doubled single-quote. Databricks uses a backslash for
+    escaping of literal strings. And SQLAlchemy's default escaping breaks Databricks SQL.
+    """
+
+    impl = sqlalchemy.types.String
+    cache_ok = True
+    pe = ParamEscaper()
+
+    def process_literal_param(self, value, dialect) -> str:
+        """SQLAlchemy's default string escaping for backslashes doesn't work for Databricks. The logic here
+        implements the same behaviour as our legacy inline escaping logic.
+        """
+
+        return self.pe.escape_string(value)
+
+    def literal_processor(self, dialect):
+        """We manually override this method to prevent further processing of the string literal beyond
+        what happens in the process_literal_param() method.
+
+        The SQLAlchemy docs _specifically_ say to not override this method.
+
+        It appears that any processing that happens from TypeEngine.process_literal_param happens _before_
+        and _in addition to_ whatever the class's impl.literal_processor() method does. The String.literal_processor()
+        method performs a string replacement that doubles any single-quote in the contained string. This raises a syntax
+        error in Databricks. And it's not necessary because ParamEscaper() already implements all the escaping we need.
+
+        We should consider opening an issue on the SQLAlchemy project to see if we're using it wrong.
+
+        See type_api.py::TypeEngine.literal_processor:
+
+        ```python
+        def process(value: Any) -> str:
+            return fixed_impl_processor(
+                fixed_process_literal_param(value, dialect)
+            )
+        ```
+
+        That call to fixed_impl_processor wraps the result of fixed_process_literal_param (which is the
+        process_literal_param defined in our Databricks dialect)
+
+        https://docs.sqlalchemy.org/en/20/core/custom_types.html#sqlalchemy.types.TypeDecorator.literal_processor
+        """
+
+        def process(value):
+            """This is a copy of the default String.literal_processor() method but stripping away
+            its double-escaping behaviour for single-quotes.
+            """
+
+            _step1 = self.process_literal_param(value, dialect="databricks")
+            if dialect.identifier_preparer._double_percents:
+                _step2 = _step1.replace("%", "%%")
+            else:
+                _step2 = _step1
+
+            return "%s" % _step2
+
+        return process
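And a hedged sketch of the escaping difference this class exists to handle. The exact output of `ParamEscaper.escape_string()` is an assumption here; the point is backslash escaping rather than SQLAlchemy's default doubled single-quote:

```python
from databricks.sqlalchemy._types import DatabricksStringType

st = DatabricksStringType()
literal = st.process_literal_param("O'Reilly", dialect=None)
# Expected (assumption): a quoted literal like 'O\'Reilly',
# where the default String type would produce 'O''Reilly'.
print(literal)
```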
src/databricks/sqlalchemy/requirements.py

Lines changed: 83 additions & 21 deletions
@@ -1,34 +1,96 @@
 """
-This module is supposedly used by the compliance tests to control which tests are run based on database capabilities.
-However, based on some experimentation that does not appear to be consistently the case. Until we better understand
-when these requirements are and are not implemented, we prefer to manually capture the exact nature of the failures
-and errors.
-
-Once we better understand how to use requirements.py, an example exclusion will look like this:
-
-import sqlalchemy.testing.requirements
-import sqlalchemy.testing.exclusions
-
-class Requirements(sqlalchemy.testing.requirements.SuiteRequirements):
-    @property
-    def __some_example_requirement(self):
-        return sqlalchemy.testing.exclusions.closed
-
-
 The complete list of requirements is provided by SQLAlchemy here:
 
 https://github.com/sqlalchemy/sqlalchemy/blob/main/lib/sqlalchemy/testing/requirements.py
+
+When SQLAlchemy skips a test because a requirement is closed() it gives a generic skip message.
+To make these failures more actionable, we only define requirements in this file that we wish to
+force to be open(). If a test should be skipped on Databricks, it will be specifically marked skip
+in test_suite.py with a Databricks-specific reason.
+
+See the special note about the array_type exclusion below.
 """
 
 import sqlalchemy.testing.requirements
 import sqlalchemy.testing.exclusions
 
-import logging
 
-logger = logging.getLogger(__name__)
+class Requirements(sqlalchemy.testing.requirements.SuiteRequirements):
+    @property
+    def date_historic(self):
+        """target dialect supports representation of Python
+        datetime.date() objects with historic (pre 1970) values."""
 
-logger.warning("requirements.py is not currently employed by Databricks dialect")
+        return sqlalchemy.testing.exclusions.open()
 
+    @property
+    def datetime_historic(self):
+        """target dialect supports representation of Python
+        datetime.datetime() objects with historic (pre 1970) values."""
 
-class Requirements(sqlalchemy.testing.requirements.SuiteRequirements):
-    pass
+        return sqlalchemy.testing.exclusions.open()
+
+    @property
+    def datetime_literals(self):
+        """target dialect supports rendering of a date, time, or datetime as a
+        literal string, e.g. via the TypeEngine.literal_processor() method.
+
+        """
+
+        return sqlalchemy.testing.exclusions.open()
+
+    @property
+    def timestamp_microseconds(self):
+        """target dialect supports representation of Python
+        datetime.datetime() with microsecond objects but only
+        if TIMESTAMP is used."""
+
+        return sqlalchemy.testing.exclusions.open()
+
+    @property
+    def time_microseconds(self):
+        """target dialect supports representation of Python
+        datetime.time() with microsecond objects.
+
+        This requirement declaration isn't needed but I've included it here for completeness.
+        Since Databricks doesn't have a TIME type, SQLAlchemy will compile Time() columns
+        as STRING Databricks data types. And we use a custom time type to render those strings
+        between str() and datetime.time() representations. Therefore we can store _any_ precision
+        that SQLAlchemy needs. The time_microseconds requirement defaults to ON for all dialects
+        except mssql, mysql, mariadb, and oracle.
+        """
+
+        return sqlalchemy.testing.exclusions.open()
+
+    @property
+    def infinity_floats(self):
+        """The Float type can persist and load float('inf'), float('-inf')."""
+
+        return sqlalchemy.testing.exclusions.open()
+
+    @property
+    def precision_numerics_retains_significant_digits(self):
+        """A precision numeric type will return empty significant digits,
+        i.e. a value such as 10.000 will come back in Decimal form with
+        the .000 maintained."""
+
+        return sqlalchemy.testing.exclusions.open()
+
+    @property
+    def precision_numerics_many_significant_digits(self):
+        """target backend supports values with many digits on both sides,
+        such as 319438950232418390.273596, 87673.594069654243
+
+        """
+        return sqlalchemy.testing.exclusions.open()
+
+    @property
+    def array_type(self):
+        """While Databricks does support ARRAY types, pysql cannot bind them. So
+        we cannot use them with SQLAlchemy.
+
+        Due to a bug in SQLAlchemy, we _must_ define this exclusion as closed() here or else the
+        test runner will crash the pytest process due to an AttributeError
+        """
+
+        return sqlalchemy.testing.exclusions.closed()
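The "specifically marked skip in test_suite.py" pattern that this docstring describes would look roughly like the sketch below; the suite class and the reason text are illustrative assumptions, not copied from test_suite.py:

```python
import pytest

# SQLAlchemy dialect test suites conventionally re-declare an inherited
# suite class in order to customize or skip it.
from sqlalchemy.testing.suite import TimeTest as _TimeTest


@pytest.mark.skip(reason="Databricks-specific reason for the failure")
class TimeTest(_TimeTest):
    pass
```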
