From 912f5b4efc732b944b3d21ad220e3d0859e21678 Mon Sep 17 00:00:00 2001 From: Jesse Whitehouse Date: Mon, 23 Oct 2023 12:15:12 -0400 Subject: [PATCH 01/14] Fix bad type return per mypy with new test for exception Signed-off-by: Jesse Whitehouse --- src/databricks/sqlalchemy/_parse.py | 19 +++++++++++++------ .../sqlalchemy/test_local/test_parsing.py | 7 +++++++ 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/src/databricks/sqlalchemy/_parse.py b/src/databricks/sqlalchemy/_parse.py index 55f34f95..df4ae245 100644 --- a/src/databricks/sqlalchemy/_parse.py +++ b/src/databricks/sqlalchemy/_parse.py @@ -11,6 +11,8 @@ wrappers around regexes. """ +class DatabricksSqlAlchemyParseException(Exception): + pass def _match_table_not_found_string(message: str) -> bool: """Return True if the message contains a substring indicating that a table was not found""" @@ -69,12 +71,14 @@ def extract_three_level_identifier_from_constraint_string(input_str: str) -> dic "schema": "pysql_dialect_compliance", "table": "users" } + + Raise a DatabricksSqlAlchemyParseException if a 3L namespace isn't found """ pat = re.compile(r"REFERENCES\s+(.*?)\s*\(") matches = pat.findall(input_str) if not matches: - return None + raise DatabricksSqlAlchemyParseException("3L namespace not found in constraint string") first_match = matches[0] parts = first_match.split(".") @@ -82,11 +86,14 @@ def extract_three_level_identifier_from_constraint_string(input_str: str) -> dic def strip_backticks(input: str): return input.replace("`", "") - return { - "catalog": strip_backticks(parts[0]), - "schema": strip_backticks(parts[1]), - "table": strip_backticks(parts[2]), - } + try: + return { + "catalog": strip_backticks(parts[0]), + "schema": strip_backticks(parts[1]), + "table": strip_backticks(parts[2]), + } + except IndexError: + raise DatabricksSqlAlchemyParseException("Incomplete 3L namespace found in constraint string: " + ".".join(parts)) def _parse_fk_from_constraint_string(constraint_str: str) -> dict: diff --git a/src/databricks/sqlalchemy/test_local/test_parsing.py b/src/databricks/sqlalchemy/test_local/test_parsing.py index ab82613e..6cb1591f 100644 --- a/src/databricks/sqlalchemy/test_local/test_parsing.py +++ b/src/databricks/sqlalchemy/test_local/test_parsing.py @@ -6,6 +6,7 @@ build_fk_dict, build_pk_dict, match_dte_rows_by_value, + DatabricksSqlAlchemyParseException ) @@ -54,6 +55,12 @@ def test_extract_3l_namespace_from_constraint_string(): extract_three_level_identifier_from_constraint_string(input) == expected ), "Failed to extract 3L namespace from constraint string" +def test_extract_3l_namespace_from_bad_constraint_string(): + input = "FOREIGN KEY (`parent_user_id`) REFERENCES `pysql_dialect_compliance`.`users` (`user_id`)" + + with pytest.raises(DatabricksSqlAlchemyParseException): + extract_three_level_identifier_from_constraint_string(input) + @pytest.mark.parametrize("schema", [None, "some_schema"]) def test_build_fk_dict(schema): From c13b4b0399da5bf2beb084945cba4ce1442328c8 Mon Sep 17 00:00:00 2001 From: Jesse Whitehouse Date: Mon, 23 Oct 2023 12:16:38 -0400 Subject: [PATCH 02/14] Ignore type checks for dialect compliance tests mypy doesn't like wildcard imports, but they are necessary here and con -sistent with SQLAlchemy's guidance for structuring tests Signed-off-by: Jesse Whitehouse --- src/databricks/sqlalchemy/test/_future.py | 2 ++ src/databricks/sqlalchemy/test/_regression.py | 2 ++ src/databricks/sqlalchemy/test/_unsupported.py | 2 ++ 3 files changed, 6 insertions(+) diff --git a/src/databricks/sqlalchemy/test/_future.py b/src/databricks/sqlalchemy/test/_future.py index 0da38634..519a4e09 100644 --- a/src/databricks/sqlalchemy/test/_future.py +++ b/src/databricks/sqlalchemy/test/_future.py @@ -1,3 +1,5 @@ +# type: ignore + from enum import Enum import pytest diff --git a/src/databricks/sqlalchemy/test/_regression.py b/src/databricks/sqlalchemy/test/_regression.py index aeeb5c3f..6342d2d5 100644 --- a/src/databricks/sqlalchemy/test/_regression.py +++ b/src/databricks/sqlalchemy/test/_regression.py @@ -1,3 +1,5 @@ +# type: ignore + import pytest from sqlalchemy.testing.suite import ( ArgSignatureTest, diff --git a/src/databricks/sqlalchemy/test/_unsupported.py b/src/databricks/sqlalchemy/test/_unsupported.py index 63932fe2..6db1a154 100644 --- a/src/databricks/sqlalchemy/test/_unsupported.py +++ b/src/databricks/sqlalchemy/test/_unsupported.py @@ -1,3 +1,5 @@ +# type: ignore + from enum import Enum import pytest From 5b51f26b7877dbb1af52b2e6ead80600b4df8a08 Mon Sep 17 00:00:00 2001 From: Jesse Whitehouse Date: Mon, 23 Oct 2023 12:20:01 -0400 Subject: [PATCH 03/14] Update type hint I changed the format of the output of dialect._describe_table_extended to return a list of dicts rather than a list of lists but didn't update this type hint Signed-off-by: Jesse Whitehouse --- src/databricks/sqlalchemy/_parse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/sqlalchemy/_parse.py b/src/databricks/sqlalchemy/_parse.py index df4ae245..dbfb988f 100644 --- a/src/databricks/sqlalchemy/_parse.py +++ b/src/databricks/sqlalchemy/_parse.py @@ -241,7 +241,7 @@ def match_dte_rows_by_value(dte_output: List[Dict[str, str]], match: str) -> Lis return output_rows -def get_fk_strings_from_dte_output(dte_output: List[List]) -> List[dict]: +def get_fk_strings_from_dte_output(dte_output: List[Dict[str, str]]) -> List[dict]: """If the DESCRIBE TABLE EXTENDED output contains foreign key constraints, return a list of dictionaries, one dictionary per defined constraint """ From 3a113cccd67d96e4f80c3620e549c0f2c9e2d206 Mon Sep 17 00:00:00 2001 From: Jesse Whitehouse Date: Mon, 23 Oct 2023 12:24:59 -0400 Subject: [PATCH 04/14] Fix type check failure This method doesn't handle cases where the regex doesn't match anything Signed-off-by: Jesse Whitehouse --- src/databricks/sqlalchemy/_parse.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/databricks/sqlalchemy/_parse.py b/src/databricks/sqlalchemy/_parse.py index dbfb988f..fcc92520 100644 --- a/src/databricks/sqlalchemy/_parse.py +++ b/src/databricks/sqlalchemy/_parse.py @@ -11,9 +11,11 @@ wrappers around regexes. """ + class DatabricksSqlAlchemyParseException(Exception): pass + def _match_table_not_found_string(message: str) -> bool: """Return True if the message contains a substring indicating that a table was not found""" @@ -78,7 +80,9 @@ def extract_three_level_identifier_from_constraint_string(input_str: str) -> dic matches = pat.findall(input_str) if not matches: - raise DatabricksSqlAlchemyParseException("3L namespace not found in constraint string") + raise DatabricksSqlAlchemyParseException( + "3L namespace not found in constraint string" + ) first_match = matches[0] parts = first_match.split(".") @@ -93,7 +97,9 @@ def strip_backticks(input: str): "table": strip_backticks(parts[2]), } except IndexError: - raise DatabricksSqlAlchemyParseException("Incomplete 3L namespace found in constraint string: " + ".".join(parts)) + raise DatabricksSqlAlchemyParseException( + "Incomplete 3L namespace found in constraint string: " + ".".join(parts) + ) def _parse_fk_from_constraint_string(constraint_str: str) -> dict: @@ -314,7 +320,11 @@ def parse_column_info_from_tgetcolumnsresponse(thrift_resp_row) -> ReflectedColu """ pat = re.compile(r"^\w+") - _raw_col_type = re.search(pat, thrift_resp_row.TYPE_NAME).group(0).lower() + + # This method assumes a valid TYPE_NAME field in the response. + # TODO: add error handling in case TGetColumnsResponse format changes + + _raw_col_type = re.search(pat, thrift_resp_row.TYPE_NAME).group(0).lower() # type: ignore _col_type = GET_COLUMNS_TYPE_MAP[_raw_col_type] if _raw_col_type == "decimal": From 95dc780443d848f7bf4ea4fdc5f53f0b4ffc0e43 Mon Sep 17 00:00:00 2001 From: Jesse Whitehouse Date: Mon, 23 Oct 2023 12:57:29 -0400 Subject: [PATCH 05/14] Fix type hint around dictionary overloading Signed-off-by: Jesse Whitehouse --- src/databricks/sqlalchemy/_parse.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/databricks/sqlalchemy/_parse.py b/src/databricks/sqlalchemy/_parse.py index fcc92520..e4c46830 100644 --- a/src/databricks/sqlalchemy/_parse.py +++ b/src/databricks/sqlalchemy/_parse.py @@ -183,10 +183,12 @@ def build_fk_dict( else: schema_override_dict = {} + # mypy doesn't like this method of conditionally adding a key to a dictionary + # while keeping everything immutable complete_foreign_key_dict = { "name": fk_name, **base_fk_dict, - **schema_override_dict, + **schema_override_dict, # type: ignore } return complete_foreign_key_dict From ffc5211085edac6bd528cabecde3d764a37e0c29 Mon Sep 17 00:00:00 2001 From: Jesse Whitehouse Date: Mon, 23 Oct 2023 14:28:34 -0400 Subject: [PATCH 06/14] Skip ReflectedColumn type return hint Signed-off-by: Jesse Whitehouse --- src/databricks/sqlalchemy/_parse.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/databricks/sqlalchemy/_parse.py b/src/databricks/sqlalchemy/_parse.py index e4c46830..25bfeb33 100644 --- a/src/databricks/sqlalchemy/_parse.py +++ b/src/databricks/sqlalchemy/_parse.py @@ -353,4 +353,5 @@ def parse_column_info_from_tgetcolumnsresponse(thrift_resp_row) -> ReflectedColu "default": thrift_resp_row.COLUMN_DEF, } - return this_column + # TODO: figure out how to return sqlalchemy.interfaces in a way that mypy respects + return this_column # type: ignore From 34daef6bbdbde4026730ede73b6fc032a8b81303 Mon Sep 17 00:00:00 2001 From: Jesse Whitehouse Date: Mon, 23 Oct 2023 14:31:04 -0400 Subject: [PATCH 07/14] Add type hint to EMPTY_FK and EMPTY_LIST Signed-off-by: Jesse Whitehouse --- src/databricks/sqlalchemy/base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/databricks/sqlalchemy/base.py b/src/databricks/sqlalchemy/base.py index 937d0a9e..f8198f38 100644 --- a/src/databricks/sqlalchemy/base.py +++ b/src/databricks/sqlalchemy/base.py @@ -73,10 +73,12 @@ class DatabricksDialect(default.DefaultDialect): # SQLAlchemy requires that a table with no primary key # constraint return a dictionary that looks like this. - EMPTY_PK = {"constrained_columns": [], "name": None} + EMPTY_PK: Dict[str, Any] = {"constrained_columns": [], "name": None} # SQLAlchemy requires that a table with no foreign keys # defined return an empty list. Same for indexes. + EMPTY_FK: List + EMPTY_INDEX: List EMPTY_FK = EMPTY_INDEX = [] @classmethod From ea0b611899050c78b1320b09a07ba2a6d6a965bb Mon Sep 17 00:00:00 2001 From: Jesse Whitehouse Date: Mon, 23 Oct 2023 14:39:24 -0400 Subject: [PATCH 08/14] Conform implementation of _parse method to the type hints declared in base.py. I re-ran HasTableTest which depend on this working to confirm that it does not cause a regression. Signed-off-by: Jesse Whitehouse --- src/databricks/sqlalchemy/_parse.py | 2 +- src/databricks/sqlalchemy/base.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/databricks/sqlalchemy/_parse.py b/src/databricks/sqlalchemy/_parse.py index 25bfeb33..ea11847c 100644 --- a/src/databricks/sqlalchemy/_parse.py +++ b/src/databricks/sqlalchemy/_parse.py @@ -35,7 +35,7 @@ def _describe_table_extended_result_to_dict_list( """Transform the CursorResult of DESCRIBE TABLE EXTENDED into a list of Dictionaries""" rows_to_return = [] - for row in result: + for row in result.all(): this_row = {"col_name": row.col_name, "data_type": row.data_type} rows_to_return.append(this_row) diff --git a/src/databricks/sqlalchemy/base.py b/src/databricks/sqlalchemy/base.py index f8198f38..5bc864fb 100644 --- a/src/databricks/sqlalchemy/base.py +++ b/src/databricks/sqlalchemy/base.py @@ -1,5 +1,5 @@ import re -from typing import Any, List, Optional, Dict, Collection, Iterable, Tuple +from typing import Any, List, Optional, Dict, Union, Collection, Iterable, Tuple import databricks.sqlalchemy._ddl as dialect_ddl_impl import databricks.sqlalchemy._types as dialect_type_impl @@ -141,7 +141,7 @@ def _describe_table_extended( catalog_name: Optional[str] = None, schema_name: Optional[str] = None, expect_result=True, - ) -> List[Dict[str, str]]: + ) -> Union[List[Dict[str, str]], None]: """Run DESCRIBE TABLE EXTENDED on a table and return a list of dictionaries of the result. This method is the fastest way to check for the presence of a table in a schema. @@ -160,7 +160,7 @@ def _describe_table_extended( stmt = DDL(f"DESCRIBE TABLE EXTENDED {_target}") try: - result = connection.execute(stmt).all() + result = connection.execute(stmt) except DatabaseError as e: if _match_table_not_found_string(str(e)): raise sqlalchemy.exc.NoSuchTableError( From 4f4c4e59f68c79dc7209adeeb3ce5cd66c3e1901 Mon Sep 17 00:00:00 2001 From: Jesse Whitehouse Date: Mon, 23 Oct 2023 14:48:33 -0400 Subject: [PATCH 09/14] Skip type hint for value assignment I'm not clear how to instruct mypy to know that the value of `result` will never be None given the way we called _describe_table_extended Signed-off-by: Jesse Whitehouse --- src/databricks/sqlalchemy/base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/databricks/sqlalchemy/base.py b/src/databricks/sqlalchemy/base.py index 5bc864fb..71cfca44 100644 --- a/src/databricks/sqlalchemy/base.py +++ b/src/databricks/sqlalchemy/base.py @@ -199,7 +199,9 @@ def get_pk_constraint( schema_name=schema, ) - raw_pk_constraints: List = get_pk_strings_from_dte_output(result) + # Type ignore is because mypy knows that self._describe_table_extended *can* + # return None (even though it never will since expect_result defaults to True) + raw_pk_constraints: List = get_pk_strings_from_dte_output(result) # type: ignore if not any(raw_pk_constraints): return self.EMPTY_PK From f154b872bcc77c778ff270de840b95ab452292cc Mon Sep 17 00:00:00 2001 From: Jesse Whitehouse Date: Mon, 23 Oct 2023 16:15:13 -0400 Subject: [PATCH 10/14] Ignore type hints that mypy doesn't understand Signed-off-by: Jesse Whitehouse --- src/databricks/sqlalchemy/base.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/databricks/sqlalchemy/base.py b/src/databricks/sqlalchemy/base.py index 71cfca44..b5fe7a37 100644 --- a/src/databricks/sqlalchemy/base.py +++ b/src/databricks/sqlalchemy/base.py @@ -201,9 +201,9 @@ def get_pk_constraint( # Type ignore is because mypy knows that self._describe_table_extended *can* # return None (even though it never will since expect_result defaults to True) - raw_pk_constraints: List = get_pk_strings_from_dte_output(result) # type: ignore + raw_pk_constraints: List = get_pk_strings_from_dte_output(result) # type: ignore if not any(raw_pk_constraints): - return self.EMPTY_PK + return self.EMPTY_PK # type: ignore if len(raw_pk_constraints) > 1: logger.warning( @@ -216,7 +216,8 @@ def get_pk_constraint( pk_name = first_pk_constraint.get("col_name") pk_constraint_string = first_pk_constraint.get("data_type") - return build_pk_dict(pk_name, pk_constraint_string) + # TODO: figure out how to return sqlalchemy.interfaces in a way that mypy respects + return build_pk_dict(pk_name, pk_constraint_string) # type: ignore def get_foreign_keys( self, connection, table_name, schema=None, **kw From 6a77bd5e184905bd21a31aecc719fbe0a85b8759 Mon Sep 17 00:00:00 2001 From: Jesse Whitehouse Date: Mon, 23 Oct 2023 16:15:49 -0400 Subject: [PATCH 11/14] Fix inaccurate type hint. This method always returns a list Signed-off-by: Jesse Whitehouse --- src/databricks/sqlalchemy/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/sqlalchemy/base.py b/src/databricks/sqlalchemy/base.py index b5fe7a37..4ef62511 100644 --- a/src/databricks/sqlalchemy/base.py +++ b/src/databricks/sqlalchemy/base.py @@ -221,7 +221,7 @@ def get_pk_constraint( def get_foreign_keys( self, connection, table_name, schema=None, **kw - ) -> ReflectedForeignKeyConstraint: + ) -> List[ReflectedForeignKeyConstraint]: """Return information about foreign_keys in `table_name`.""" result = self._describe_table_extended( From f1049ac37fda09d174be53fa409bb0f0c5cde589 Mon Sep 17 00:00:00 2001 From: Jesse Whitehouse Date: Mon, 23 Oct 2023 16:17:00 -0400 Subject: [PATCH 12/14] Similar fix to 7825f96c911f754a2b9eb841425b8af1c1449454 mypy doesn't understand that describe_table_extended will never return a NoneType under these circumstances Signed-off-by: Jesse Whitehouse --- src/databricks/sqlalchemy/base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/databricks/sqlalchemy/base.py b/src/databricks/sqlalchemy/base.py index 4ef62511..46ca5dc3 100644 --- a/src/databricks/sqlalchemy/base.py +++ b/src/databricks/sqlalchemy/base.py @@ -230,7 +230,9 @@ def get_foreign_keys( schema_name=schema, ) - raw_fk_constraints: List = get_fk_strings_from_dte_output(result) + # Type ignore is because mypy knows that self._describe_table_extended *can* + # return None (even though it never will since expect_result defaults to True) + raw_fk_constraints: List = get_fk_strings_from_dte_output(result) # type: ignore if not any(raw_fk_constraints): return self.EMPTY_FK From 3fa6c35c55247f7de9d2185cf01852a52d52066a Mon Sep 17 00:00:00 2001 From: Jesse Whitehouse Date: Mon, 23 Oct 2023 16:17:54 -0400 Subject: [PATCH 13/14] Similar change to 265ac3ac22da1e19159e81135b52aa8b4f5c0bc4 Not sure how we can actually match the interface mypy expects Signed-off-by: Jesse Whitehouse --- src/databricks/sqlalchemy/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/databricks/sqlalchemy/base.py b/src/databricks/sqlalchemy/base.py index 46ca5dc3..3684789e 100644 --- a/src/databricks/sqlalchemy/base.py +++ b/src/databricks/sqlalchemy/base.py @@ -246,7 +246,8 @@ def get_foreign_keys( ) fk_constraints.append(this_constraint_dict) - return fk_constraints + # TODO: figure out how to return sqlalchemy.interfaces in a way that mypy respects + return fk_constraints # type: ignore def get_indexes(self, connection, table_name, schema=None, **kw): """SQLAlchemy requires this method. Databricks doesn't support indexes.""" From 9df170d87c227408c7adaee6a52132d8d0930b1b Mon Sep 17 00:00:00 2001 From: Jesse Whitehouse Date: Mon, 23 Oct 2023 16:18:17 -0400 Subject: [PATCH 14/14] Black source files Signed-off-by: Jesse Whitehouse --- src/databricks/sqlalchemy/_parse.py | 2 +- src/databricks/sqlalchemy/test/_unsupported.py | 2 +- src/databricks/sqlalchemy/test_local/test_parsing.py | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/databricks/sqlalchemy/_parse.py b/src/databricks/sqlalchemy/_parse.py index ea11847c..3d96e160 100644 --- a/src/databricks/sqlalchemy/_parse.py +++ b/src/databricks/sqlalchemy/_parse.py @@ -354,4 +354,4 @@ def parse_column_info_from_tgetcolumnsresponse(thrift_resp_row) -> ReflectedColu } # TODO: figure out how to return sqlalchemy.interfaces in a way that mypy respects - return this_column # type: ignore + return this_column # type: ignore diff --git a/src/databricks/sqlalchemy/test/_unsupported.py b/src/databricks/sqlalchemy/test/_unsupported.py index 6db1a154..899e73e4 100644 --- a/src/databricks/sqlalchemy/test/_unsupported.py +++ b/src/databricks/sqlalchemy/test/_unsupported.py @@ -1,4 +1,4 @@ -# type: ignore +# type: ignore from enum import Enum diff --git a/src/databricks/sqlalchemy/test_local/test_parsing.py b/src/databricks/sqlalchemy/test_local/test_parsing.py index 6cb1591f..f17814f9 100644 --- a/src/databricks/sqlalchemy/test_local/test_parsing.py +++ b/src/databricks/sqlalchemy/test_local/test_parsing.py @@ -6,7 +6,7 @@ build_fk_dict, build_pk_dict, match_dte_rows_by_value, - DatabricksSqlAlchemyParseException + DatabricksSqlAlchemyParseException, ) @@ -55,6 +55,7 @@ def test_extract_3l_namespace_from_constraint_string(): extract_three_level_identifier_from_constraint_string(input) == expected ), "Failed to extract 3L namespace from constraint string" + def test_extract_3l_namespace_from_bad_constraint_string(): input = "FOREIGN KEY (`parent_user_id`) REFERENCES `pysql_dialect_compliance`.`users` (`user_id`)"