diff --git a/src/bigquery_view_analyzer/analyzer.py b/src/bigquery_view_analyzer/analyzer.py index 56f0194..2193ec3 100644 --- a/src/bigquery_view_analyzer/analyzer.py +++ b/src/bigquery_view_analyzer/analyzer.py @@ -1,16 +1,14 @@ -import sys import logging import re -from typing import Optional +import sys +from typing import List, Optional -import google.auth from anytree import LevelOrderIter, NodeMixin, RenderTree from colorama import Fore, init from google.cloud import bigquery -from google.cloud.bigquery import Table, AccessEntry, Dataset +from google.cloud.bigquery import AccessEntry, Dataset, Table -STANDARD_SQL_TABLE_PATTERN = r"(?:(?:FROM|JOIN)\s+?)?`(?P[-\w]+?)`?\.`?(?P[\w]+?)`?\.`?(?P[\w]+)`?(?!\()\b" -LEGACY_SQL_TABLE_PATTERN = r"(?:(?:FROM|JOIN)\s+?)?\[(?:(?P[-\w]+?)(?:\:))?(?P[-\w]+?)\.(?P
[-\w]+?)\]" +SQL_TABLE_PATTERN = r"(?:(?:FROM|JOIN)\s+?)[\x60\[]?(?:(?P[\w][-\w]+?)\x60?[\:\.])?\x60?(?P[\w]+?)\x60?\.\x60?(?P
[\w]+)[\x60\]]?(?:\s|$)" COMMENTS_PATTERN = r"(\/\*(.|[\r\n])*?\*\/)|(--.*)" log = logging.getLogger("bqva.analyzer") @@ -18,12 +16,16 @@ try: client = bigquery.Client() -except google.auth.exceptions.DefaultCredentialsError as e: +except Exception as e: log.error(e) sys.exit(1) class TableNode(NodeMixin): + table: Table + parent: Optional["TableNode"] + children: List[Optional["TableNode"]] + def __init__( self, table: Table, parent: Optional["TableNode"] = None, children=None ): @@ -185,9 +187,7 @@ def _get_table(self, project_id: str, dataset_id: str, table_id: str) -> Table: def extract_table_references(query, is_legacy_sql): # Remove comments from query to avoid picking up tables from commented out SQL code view_query = re.sub(COMMENTS_PATTERN, "", query) - table_pattern = ( - LEGACY_SQL_TABLE_PATTERN if is_legacy_sql else STANDARD_SQL_TABLE_PATTERN - ) + table_pattern = LEGACY_SQL_TABLE_PATTERN if is_legacy_sql else SQL_TABLE_PATTERN tables = re.findall(table_pattern, view_query, re.IGNORECASE | re.MULTILINE) return tables diff --git a/tests/test_view_analyzer.py b/tests/test_view_analyzer.py index 3c57e1b..0c053ef 100644 --- a/tests/test_view_analyzer.py +++ b/tests/test_view_analyzer.py @@ -1,25 +1,20 @@ import re -import pytest -from bigquery_view_analyzer.analyzer import ( - STANDARD_SQL_TABLE_PATTERN, - LEGACY_SQL_TABLE_PATTERN, - ViewAnalyzer, -) +import pytest +from bigquery_view_analyzer.analyzer import SQL_TABLE_PATTERN, ViewAnalyzer valid_standard_table_references = [ "`project.dataset.table`", "`project`.dataset.table", "`project.dataset`.table", "`project`.`dataset`.`table`", + "project.dataset.table", ] invalid_standard_table_references = [ - "project.`dataset`.table", - "project.`dataset.table`", - "project.dataset.`table`", - "project.dataset.table", "`project`.dataset.function()", + "`dataset.function()`", + "dataset.function()", ] legacy_table_references = ["[project:dataset.table]", "[dataset.table]"] @@ -43,9 +38,7 @@ def test_valid_standard_table_reference_in_view(table_a, table_b, join_prefix): """.format( table_a=table_a, table_b=table_b, join_prefix=join_prefix ) - match = re.findall( - STANDARD_SQL_TABLE_PATTERN, sql_ddl, re.IGNORECASE | re.MULTILINE - ) + match = re.findall(SQL_TABLE_PATTERN, sql_ddl, re.IGNORECASE | re.MULTILINE) assert match is not None assert len(match) == 2 # find both table a and b @@ -67,9 +60,7 @@ def test_invalid_standard_table_reference_in_view(table_a, table_b, join_prefix) """.format( table_a=table_a, table_b=table_b, join_prefix=join_prefix ) - match = re.findall( - STANDARD_SQL_TABLE_PATTERN, sql_ddl, re.IGNORECASE | re.MULTILINE - ) + match = re.findall(SQL_TABLE_PATTERN, sql_ddl, re.IGNORECASE | re.MULTILINE) assert match == [] @@ -90,7 +81,7 @@ def test_legacy_table_reference_in_view(table_a, table_b, join_prefix): """.format( table_a=table_a, table_b=table_b, join_prefix=join_prefix ) - match = re.findall(LEGACY_SQL_TABLE_PATTERN, sql_ddl, re.IGNORECASE | re.MULTILINE) + match = re.findall(SQL_TABLE_PATTERN, sql_ddl, re.IGNORECASE | re.MULTILINE) assert match is not None assert len(match) == 2 # find both table a and b