sqlite support

sfu-db · Feb 10, 2025 · 5303ab4 · 5303ab4
1 parent 10f23d3
commit 5303ab4
Show file tree

Hide file tree

Showing 7 changed files with 56 additions and 15 deletions.
diff --git a/README.md b/README.md
@@ -66,12 +66,13 @@ When entering the `conn_string` parameter, only supported databases' connection
 ## Database Connection Types
 - [x] Postgres
 - [x] dbt-Postgres
+- [x] Sqlite
 - [ ] Mysql
-- [ ] Sqlite
 - [ ] SQL Server
 - [ ] Oracle
 - [ ] ...
 
 
 # Documentation
-Doc: https://sfu-db.github.io/lineagex/intro.html or just [here](https://sfu-db.github.io/lineagex/intro.html)
+Doc: https://sfu-db.github.io/lineagex/intro.html or just [here](https://sfu-db.github.io/lineagex/intro.html)
+Javascript Source: Compiled and forked from [here](https://github.com/Bert0324/lineage-dag) by Bert Huang
diff --git a/lineagex/ColumnLineageNoConn.py b/lineagex/ColumnLineageNoConn.py
@@ -340,9 +340,10 @@ def _sub_shared_col_conds_cte(
                     )
                     all_cte_sub_cols.extend(temp_sub_cols)
                     if len(all_cte_sub_table) == 1:
-                        self.table_alias_dict[
-                            sub_ast.find(exp.TableAlias).alias_or_name
-                        ] = all_cte_sub_table[0]
+                        if sub_ast.find(exp.TableAlias):
+                            self.table_alias_dict[
+                                sub_ast.find(exp.TableAlias).alias_or_name
+                            ] = all_cte_sub_table[0]
                     potential_cte_sub_table = temp_sub_table
                     if type(sub_ast.parent) in from_join_exp:
                         temp_sub_dict = {}
@@ -359,7 +360,10 @@ def _sub_shared_col_conds_cte(
                         #         target_dict=temp_sub_dict,
                         #         source_table=temp_sub_table,
                         #     )
-                        sub_name = sub_ast.find(exp.TableAlias).alias_or_name
+                        if sub_ast.find(exp.TableAlias) is not None:
+                            sub_name = sub_ast.find(exp.TableAlias).alias_or_name
+                        else:
+                            sub_name = "no_name_subquery"
                         self.cte_dict[sub_name] = temp_sub_dict
                         sub_ast.replace(exp.Table(this=sub_name))
                     sub_ast.pop()
@@ -745,7 +749,8 @@ def _find_alias_col(
         if len(temp) < 2:
             for t in temp_table:
                 if t in self.input_table_dict.keys():
-                    if col_sql in self.input_table_dict[t]:
+                    # resolve any case mismatching
+                    if col_sql.lower() in [x.lower() for x in self.input_table_dict[t]]:
                         if ref:
                             return [[], [t + "." + col_sql]]
                         else:
@@ -927,7 +932,7 @@ def _resolve_agg_star(
                                 temp_col = temp_col + cols[0] + cols[1]
                             target_dict[col_name] = [
                                 [""],
-                                list(self.all_used_col.union(set(temp_col))),
+                                list(set(self.all_used_col).union(set(temp_col))),
                             ]
                         elif t_name in self.cte_dict.keys():
                             temp_col = []

diff --git a/lineagex/LineageXNoConn.py b/lineagex/LineageXNoConn.py
@@ -35,6 +35,7 @@ def __init__(
         dialect: str = "postgres",
         target_schema: Optional[str] = "public",
         search_path_schema: Optional[str] = "public",
+        input_table_dict: Optional[dict] = None
     ) -> None:
         self.output_dict = {}
         self.parsed = 0
@@ -45,7 +46,10 @@ def __init__(
         self.sql_files_dict = s2d.sql_files_dict
         self.org_sql_files_dict = s2d.org_sql_files_dict
         self.dialect = dialect
-        self.input_table_dict = {}
+        if input_table_dict is None:
+            self.input_table_dict = {}
+        else:
+            self.input_table_dict = input_table_dict
         self.finished_list = []
         self._find_lineage_no_conn()
 
@@ -58,7 +62,8 @@ def _find_lineage_no_conn(self):
         start_time = time.time()
         for name, sql in self.sql_files_dict.items():
             try:
-                # sql_ast = parse_one(sql, read=self.dialect)
+                sql_ast = parse_one(sql, read=self.dialect)
+                #print(sql)
                 sql_ast = parse_one_sql(sql='''''' + sql + '''''')
                 all_tables = self._resolve_table(part_ast=sql_ast)
                 for t in all_tables:

diff --git a/lineagex/SqlToDict.py b/lineagex/SqlToDict.py
@@ -2,7 +2,7 @@
 import re
 from typing import List, Optional, Union
 
-from .utils import find_select, get_files, remove_comments
+from .utils import find_select, get_files, remove_comments_sqlite, remove_comments_pg
 
 rem_regex = re.compile(r"[^a-zA-Z0-9_.]")
 
@@ -35,7 +35,13 @@ def _sql_to_dict(self) -> None:
             self.sql_files = get_files(path=self.path)
             for f in self.sql_files:
                 org_sql = open(f, mode="r", encoding="latin-1").read()
-                new_sql = remove_comments(str1=org_sql)
+                if self.dialect == "sqlite":
+                    new_sql = remove_comments_sqlite(str1=org_sql)
+                elif self.dialect == "postgres":
+                    new_sql = remove_comments_pg(str1=org_sql)
+                else:
+                    new_sql = remove_comments_pg(str1=org_sql)
+                #new_sql = remove_comments(str1=org_sql)
                 org_sql_split = list(filter(None, new_sql.split(";")))
                 # pop DROP IF EXISTS
                 if len(org_sql_split) > 0:
@@ -67,11 +73,13 @@ def _preprocess_sql(
         :param new_sql: the sql for parsing, file: file name for the sql, org_sql: the most original sql
         :return: None
         """
-        ret_sql = remove_comments(str1=new_sql)
+        #ret_sql = remove_comments(str1=new_sql)
         if self.dialect == "sqlite":
+            ret_sql = remove_comments_sqlite(str1=new_sql)
             ret_sql = ret_sql.replace('"', "'")
             ret_sql = ret_sql.replace(" REL)", " REAL)").replace("IS NOT ''", "IS NOT NULL").replace("`", '"')
         elif self.dialect == "postgres":
+            ret_sql = remove_comments_pg(str1=new_sql)
             ret_sql = ret_sql.replace("`", '')
         # remove any database names in the query
         if self.schema_list:

diff --git a/lineagex/lineagex.py b/lineagex/lineagex.py
@@ -50,6 +50,7 @@ def __init__(
         conn_string: Optional[str] = None,
         search_path_schema: Optional[str] = "",
         dialect: str = "postgres",
+        input_table_dict: Optional[dict] = None,
     ) -> None:
         validate_sql(sql)
         target_schema, search_path_schema = validate_schema(
@@ -74,6 +75,7 @@ def __init__(
                 dialect=dialect,
                 target_schema=target_schema,
                 search_path_schema=search_path_schema,
+                input_table_dict=input_table_dict,
             )
             save_js_file()
             self.output_dict = lx.output_dict

diff --git a/lineagex/utils.py b/lineagex/utils.py
@@ -6,7 +6,7 @@
 from psycopg2.extensions import connection
 
 
-def remove_comments(str1: Optional[str] = "") -> str:
+def remove_comments_sqlite(str1: Optional[str] = "") -> str:
     """
     Remove comments/excessive spaces/"create table as"/"create view as" from the sql file
     :param str1: the original sql
@@ -57,6 +57,26 @@ def remove_comments(str1: Optional[str] = "") -> str:
     return str1
 
 
+def remove_comments_pg(str1: Optional[str] = "") -> str:
+    """
+    Remove comments/excessive spaces/"create table as"/"create view as" from the sql file
+    :param str1: the original sql
+    :return: the parsed sql
+    """
+    # remove the /* */ comments
+    q = re.sub(r"/\*[^*]*\*+(?:[^*/][^*]*\*+)*/", "", str1)
+    # remove whole line -- and # comments
+    lines = [line for line in q.splitlines() if not re.match("^\s*(--|#)", line)]
+    # remove trailing -- and # comments
+    q = " ".join([re.split("--|#", line)[0] for line in lines])
+    # replace all spaces around commas
+    q = re.sub(r"\s*,\s*", ",", q)
+    # replace all multiple spaces to one space
+    str1 = re.sub("\s\s+", " ", q)
+    str1 = str1.replace("\n", " ").strip()
+    return str1
+
+
 def find_column(
     table_name: Optional[str] = "",
     engine: Any = None,

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "lineagex"
-version = "0.0.26"
+version = "0.0.27"
 description = "A column lineage tool"
 authors = ["zshandy <[email protected]>"]
 license = "MIT"