Skip to content

Commit

Permalink
fix: Update table regex (fixes #20 #21)
Browse files Browse the repository at this point in the history
  • Loading branch information
christippett committed Aug 2, 2021
1 parent 14fbc30 commit 5a24c5f
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 27 deletions.
20 changes: 10 additions & 10 deletions src/bigquery_view_analyzer/analyzer.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,31 @@
import sys
import logging
import re
from typing import Optional
import sys
from typing import List, Optional

import google.auth
from anytree import LevelOrderIter, NodeMixin, RenderTree
from colorama import Fore, init
from google.cloud import bigquery
from google.cloud.bigquery import Table, AccessEntry, Dataset
from google.cloud.bigquery import AccessEntry, Dataset, Table

# Standard SQL reference of the form `project`.dataset.table. The FROM/JOIN
# prefix is optional, the backtick before the project is mandatory, and the
# remaining backticks are optional. The trailing (?!\() rejects a name that
# is immediately followed by "(", i.e. a function call.
STANDARD_SQL_TABLE_PATTERN = r"(?:(?:FROM|JOIN)\s+?)?`(?P<project>[-\w]+?)`?\.`?(?P<dataset>[\w]+?)`?\.`?(?P<table>[\w]+)`?(?!\()\b"
# Legacy SQL reference in square brackets: [project:dataset.table] or
# [dataset.table]. The "project:" part is optional, and so is the FROM/JOIN
# prefix.
LEGACY_SQL_TABLE_PATTERN = r"(?:(?:FROM|JOIN)\s+?)?\[(?:(?P<project>[-\w]+?)(?:\:))?(?P<dataset>[-\w]+?)\.(?P<table>[-\w]+?)\]"
# Combined pattern (\x60 is the backtick character):
#   * a FROM or JOIN keyword followed by whitespace is required;
#   * an optional opening backtick or "[";
#   * an optional project (at least two characters, the first a word
#     character) followed by ":" or ".";
#   * dataset "." table, each optionally wrapped in backticks;
#   * an optional closing backtick or "]";
#   * the reference must then be followed by whitespace or "$", so a name
#     directly followed by "(" (a function call) does not match.
SQL_TABLE_PATTERN = r"(?:(?:FROM|JOIN)\s+?)[\x60\[]?(?:(?P<project>[\w][-\w]+?)\x60?[\:\.])?\x60?(?P<dataset>[\w]+?)\x60?\.\x60?(?P<table>[\w]+)[\x60\]]?(?:\s|$)"
# SQL comments: /* ... */ blocks (which may span several lines) and "--"
# comments running to the end of the line.
COMMENTS_PATTERN = r"(\/\*(.|[\r\n])*?\*\/)|(--.*)"

log = logging.getLogger("bqva.analyzer")
init(autoreset=True)

try:
client = bigquery.Client()
except google.auth.exceptions.DefaultCredentialsError as e:
except Exception as e:
log.error(e)
sys.exit(1)


class TableNode(NodeMixin):
table: Table
parent: Optional["TableNode"]
children: List[Optional["TableNode"]]

def __init__(
self, table: Table, parent: Optional["TableNode"] = None, children=None
):
Expand Down Expand Up @@ -185,9 +187,7 @@ def _get_table(self, project_id: str, dataset_id: str, table_id: str) -> Table:
def extract_table_references(query, is_legacy_sql):
    """Return the table references found in a view's SQL text.

    Comments are stripped before matching so that tables mentioned only in
    commented-out SQL are not reported.

    Args:
        query: SQL text of the view definition.
        is_legacy_sql: When truthy, match with ``LEGACY_SQL_TABLE_PATTERN``;
            otherwise match with ``SQL_TABLE_PATTERN``.

    Returns:
        The list produced by ``re.findall``: one tuple of the pattern's
        captured groups (project, dataset, table) per match. A group that
        did not take part in the match, such as an omitted project, is an
        empty string.
    """
    # Remove comments from query to avoid picking up tables from commented out SQL code
    view_query = re.sub(COMMENTS_PATTERN, "", query)
    # Single, effective pattern selection (the earlier duplicate assignment
    # was dead code that still named the superseded standard-SQL pattern).
    table_pattern = LEGACY_SQL_TABLE_PATTERN if is_legacy_sql else SQL_TABLE_PATTERN
    # Both flags are applied to the selected pattern: case-insensitive
    # matching, and "$" also matching at the end of each line.
    return re.findall(table_pattern, view_query, re.IGNORECASE | re.MULTILINE)

Expand Down
25 changes: 8 additions & 17 deletions tests/test_view_analyzer.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,20 @@
import re
import pytest

from bigquery_view_analyzer.analyzer import (
STANDARD_SQL_TABLE_PATTERN,
LEGACY_SQL_TABLE_PATTERN,
ViewAnalyzer,
)
import pytest
from bigquery_view_analyzer.analyzer import SQL_TABLE_PATTERN, ViewAnalyzer

valid_standard_table_references = [
"`project.dataset.table`",
"`project`.dataset.table",
"`project.dataset`.table",
"`project`.`dataset`.`table`",
"project.dataset.table",
]

invalid_standard_table_references = [
"project.`dataset`.table",
"project.`dataset.table`",
"project.dataset.`table`",
"project.dataset.table",
"`project`.dataset.function()",
"`dataset.function()`",
"dataset.function()",
]

legacy_table_references = ["[project:dataset.table]", "[dataset.table]"]
Expand All @@ -43,9 +38,7 @@ def test_valid_standard_table_reference_in_view(table_a, table_b, join_prefix):
""".format(
table_a=table_a, table_b=table_b, join_prefix=join_prefix
)
match = re.findall(
STANDARD_SQL_TABLE_PATTERN, sql_ddl, re.IGNORECASE | re.MULTILINE
)
match = re.findall(SQL_TABLE_PATTERN, sql_ddl, re.IGNORECASE | re.MULTILINE)
assert match is not None
assert len(match) == 2 # find both table a and b

Expand All @@ -67,9 +60,7 @@ def test_invalid_standard_table_reference_in_view(table_a, table_b, join_prefix)
""".format(
table_a=table_a, table_b=table_b, join_prefix=join_prefix
)
match = re.findall(
STANDARD_SQL_TABLE_PATTERN, sql_ddl, re.IGNORECASE | re.MULTILINE
)
match = re.findall(SQL_TABLE_PATTERN, sql_ddl, re.IGNORECASE | re.MULTILINE)
assert match == []


Expand All @@ -90,7 +81,7 @@ def test_legacy_table_reference_in_view(table_a, table_b, join_prefix):
""".format(
table_a=table_a, table_b=table_b, join_prefix=join_prefix
)
match = re.findall(LEGACY_SQL_TABLE_PATTERN, sql_ddl, re.IGNORECASE | re.MULTILINE)
match = re.findall(SQL_TABLE_PATTERN, sql_ddl, re.IGNORECASE | re.MULTILINE)
assert match is not None
assert len(match) == 2 # find both table a and b

Expand Down

0 comments on commit 5a24c5f

Please sign in to comment.