From 10f23d33833f409d3e4f400e194d1f51b6ac0df6 Mon Sep 17 00:00:00 2001 From: zshandy Date: Sat, 27 Apr 2024 23:36:47 -0700 Subject: [PATCH] fixing quotes for sqlite --- lineagex/ColumnLineageNoConn.py | 2 +- lineagex/LineageXNoConn.py | 2 +- lineagex/SqlToDict.py | 2 +- lineagex/utils.py | 41 ++++++++++++++++++++++++--------- pyproject.toml | 2 +- 5 files changed, 34 insertions(+), 15 deletions(-) diff --git a/lineagex/ColumnLineageNoConn.py b/lineagex/ColumnLineageNoConn.py index 08de126..5b91fce 100644 --- a/lineagex/ColumnLineageNoConn.py +++ b/lineagex/ColumnLineageNoConn.py @@ -71,7 +71,7 @@ def __init__( self.unnest_dict = {} self.input_table_dict = input_table_dict # self.sql_ast = parse_one(sql, read=dialect) - self.sql_ast = parse_one_sql(sql=sql) + self.sql_ast = parse_one_sql(sql='''''' + sql + '''''') self.all_used_col = [] self.table_list = [] self.all_subquery_table = [] diff --git a/lineagex/LineageXNoConn.py b/lineagex/LineageXNoConn.py index 9476748..ffe499f 100644 --- a/lineagex/LineageXNoConn.py +++ b/lineagex/LineageXNoConn.py @@ -59,7 +59,7 @@ def _find_lineage_no_conn(self): for name, sql in self.sql_files_dict.items(): try: # sql_ast = parse_one(sql, read=self.dialect) - sql_ast = parse_one_sql(sql=sql) + sql_ast = parse_one_sql(sql='''''' + sql + '''''') all_tables = self._resolve_table(part_ast=sql_ast) for t in all_tables: if t in self.sql_files_dict.keys() and t not in self.finished_list: diff --git a/lineagex/SqlToDict.py b/lineagex/SqlToDict.py index 15b10da..4bb8322 100644 --- a/lineagex/SqlToDict.py +++ b/lineagex/SqlToDict.py @@ -70,7 +70,7 @@ def _preprocess_sql( ret_sql = remove_comments(str1=new_sql) if self.dialect == "sqlite": ret_sql = ret_sql.replace('"', "'") - ret_sql = ret_sql.replace("`", '"') + ret_sql = ret_sql.replace(" REL)", " REAL)").replace("IS NOT ''", "IS NOT NULL").replace("`", '"') elif self.dialect == "postgres": ret_sql = ret_sql.replace("`", '') # remove any database names in the query diff --git a/lineagex/utils.py b/lineagex/utils.py index 2ea3385..527f018 100644 --- a/lineagex/utils.py +++ b/lineagex/utils.py @@ -19,17 +19,36 @@ def remove_comments(str1: Optional[str] = "") -> str: # remove trailing -- and # comments # pattern = r"(?:--|#)(?!.*(['""])[^'""]*\1)[^'\n\r]*" # q = " ".join([re.sub(pattern, "", line) for line in lines]) - q = " ".join( - [ - re.split("--|#", line)[0] - if line.find("'#") == -1 - and line.find('"#') == -1 - and line.find("'--") == -1 - and line.find('"--') == -1 - else line - for line in lines - ] - ) + q = "" + comment_symbol = ["--", "#"] + for line in lines: + new_line = line + for c in comment_symbol: + quoted = False + # if there is a comment symbol + if line.find(c) != -1: + c_idx = line.find(c) + # if there is a ' on the left + if line.rfind("'", c_idx) != -1: + q_idx = line.rfind("'", c_idx) + # find the corresponding ' on the right + if line.find("'", q_idx) != -1: + quoted = True + if not quoted: + new_line = re.split("--|#", line)[0] + q += " " + new_line + + # q = " ".join( + # [ + # re.split("--|#", line)[0] + # if line.find("'#") == -1 + # and line.find('"#') == -1 + # and line.find("'--") == -1 + # and line.find('"--') == -1 + # else line + # for line in lines + # ] + # ) # replace all spaces around commas q = re.sub(r"\s*,\s*", ",", q) # replace all multiple spaces to one space diff --git a/pyproject.toml b/pyproject.toml index 0f8f8d1..4461d6f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "lineagex" -version = "0.0.25" +version = "0.0.26" description = "A column lineage tool" authors = ["zshandy "] license = "MIT"