From 5552ab2168261c693062baf9ecbd7d057f1828a9 Mon Sep 17 00:00:00 2001 From: Padraig Alton Date: Fri, 9 Feb 2024 17:05:50 +0000 Subject: [PATCH 1/3] Add support for ephemeral models --- CHANGELOG.md | 4 ++++ dbt2looker/models.py | 5 +++-- dbt2looker/parser.py | 10 +++++----- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9943468..676d862 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ Recent and upcoming changes to dbt2looker +## Unreleased +### Added +- support ephemeral models (#57) + ## 0.11.0 ### Added - support label and hidden fields (#49) diff --git a/dbt2looker/models.py b/dbt2looker/models.py index 3430eaf..9961182 100644 --- a/dbt2looker/models.py +++ b/dbt2looker/models.py @@ -1,5 +1,5 @@ from enum import Enum -from typing import Union, Dict, List, Optional +from typing import Any, Union, Dict, List, Optional try: from typing import Literal except ImportError: @@ -144,6 +144,7 @@ class DbtModelColumn(BaseModel): class DbtNode(BaseModel): unique_id: str resource_type: str + config: Dict[str, Any] class Dbt2LookerExploreJoin(BaseModel): @@ -224,4 +225,4 @@ def case_insensitive_column_names(cls, v: Dict[str, DbtCatalogNodeColumn]): class DbtCatalog(BaseModel): - nodes: Dict[str, DbtCatalogNode] \ No newline at end of file + nodes: Dict[str, DbtCatalogNode] diff --git a/dbt2looker/parser.py b/dbt2looker/parser.py index ed310f3..03045c9 100644 --- a/dbt2looker/parser.py +++ b/dbt2looker/parser.py @@ -31,21 +31,21 @@ def tags_match(query_tag: str, model: models.DbtModel) -> bool: def parse_models(raw_manifest: dict, tag=None) -> List[models.DbtModel]: manifest = models.DbtManifest(**raw_manifest) - all_models: List[models.DbtModel] = [ + materialized_models: List[models.DbtModel] = [ node for node in manifest.nodes.values() - if node.resource_type == 'model' + if node.resource_type == 'model' and node.config['materialized'] != 'ephemeral' ] # Empty model files have many missing parameters - for model in all_models: + for model in materialized_models: if not hasattr(model, 'name'): logging.error('Cannot parse model with id: "%s" - is the model file empty?', model.unique_id) raise SystemExit('Failed') if tag is None: - return all_models - return [model for model in all_models if tags_match(tag, model)] + return materialized_models + return [model for model in materialized_models if tags_match(tag, model)] def check_models_for_missing_column_types(dbt_typed_models: List[models.DbtModel]): From 8aad35994af892777a60194ec7a88053c25c57bc Mon Sep 17 00:00:00 2001 From: Padraig Alton Date: Fri, 9 Feb 2024 17:14:27 +0000 Subject: [PATCH 2/3] Reorder parse_models logic to ignore non-selected empty model files --- CHANGELOG.md | 3 +++ dbt2looker/parser.py | 11 +++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 676d862..7066f58 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,9 @@ Recent and upcoming changes to dbt2looker ### Added - support ephemeral models (#57) +### Changed +- only non-ephemeral models _selected by tag logic_ are checked to ensure the model files are not empty (instead of all models) (#57) + ## 0.11.0 ### Added - support label and hidden fields (#49) diff --git a/dbt2looker/parser.py b/dbt2looker/parser.py index 03045c9..f49477f 100644 --- a/dbt2looker/parser.py +++ b/dbt2looker/parser.py @@ -37,15 +37,18 @@ def parse_models(raw_manifest: dict, tag=None) -> List[models.DbtModel]: if node.resource_type == 'model' and node.config['materialized'] != 'ephemeral' ] + if tag is None: + selected_models = materialized_models + else: + selected_models = [model for model in materialized_models if tags_match(tag, model)] + # Empty model files have many missing parameters - for model in materialized_models: + for model in selected_models: if not hasattr(model, 'name'): logging.error('Cannot parse model with id: "%s" - is the model file empty?', model.unique_id) raise SystemExit('Failed') - if tag is None: - return materialized_models - return [model for model in materialized_models if tags_match(tag, model)] + return selected_models def check_models_for_missing_column_types(dbt_typed_models: List[models.DbtModel]): From 8c5a2c31ca3e5312eba64177b9144868c9215038 Mon Sep 17 00:00:00 2001 From: Padraig Alton Date: Fri, 19 Jan 2024 13:28:11 +0000 Subject: [PATCH 3/3] Added helpful warnings for manifest/catalog discrepancy --- CHANGELOG.md | 2 ++ dbt2looker/parser.py | 45 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7066f58..9740204 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ Recent and upcoming changes to dbt2looker ## Unreleased ### Added - support ephemeral models (#57) +- warnings if there is a discrepancy between manifest and catalog (#5) +- more descriptive error message when a column's data type can't be inferred due to not being in the catalog ### Changed - only non-ephemeral models _selected by tag logic_ are checked to ensure the model files are not empty (instead of all models) (#57) diff --git a/dbt2looker/parser.py b/dbt2looker/parser.py index f49477f..9e29d5b 100644 --- a/dbt2looker/parser.py +++ b/dbt2looker/parser.py @@ -57,6 +57,33 @@ def check_models_for_missing_column_types(dbt_typed_models: List[models.DbtModel logging.debug('Model %s has no typed columns, no dimensions will be generated. %s', model.unique_id, model) +def compare_model_vs_node_columns(model: models.DbtModel, node: models.DbtCatalogNode): + model_columns = set(model.columns.keys()) # as defined in YML config + catalogued_columns = set(node.columns.keys()) # as defined in SQL + + # if the YML and SQL columns exactly match, return early + if not model_columns.symmetric_difference(catalogued_columns): + return + + if model_columns.issubset(catalogued_columns): + for undocumented_column in sorted(catalogued_columns.difference(model_columns)): + logging.warning( + f'Column {model.unique_id}.{undocumented_column} has not been documented in YML, ' + 'but is present in the catalog. You should add it to your YML config, ' + 'or (if it is not required) remove it from the model SQL file, run the model, ' + 'and run `dbt docs generate` again') + # after warning the user, return early + return + + # otherwise, there are columns defined in YML that don't match what's defined in SQL + for missing_column in sorted(model_columns.difference(catalogued_columns)): + logging.warning( + f'Column {model.unique_id}.{missing_column} documented in YML, ' + 'but is not defined in the DBT catalog. Check the model SQL file ' + 'and ensure you have run the model and `dbt docs generate`') + return # final return explicitly included for clarity + + def parse_typed_models(raw_manifest: dict, raw_catalog: dict, tag: Optional[str] = None): catalog_nodes = parse_catalog_nodes(raw_catalog) dbt_models = parse_models(raw_manifest, tag=tag) @@ -77,6 +104,11 @@ def parse_typed_models(raw_manifest: dict, raw_catalog: dict, tag: Optional[str] logging.warning( f'Model {model.unique_id} not found in catalog. No looker view will be generated. ' f'Check if model has materialized in {adapter_type} at {model.relation_name}') + else: + # we know that the model is included in the catalog - extract it + corresponding_catalog_node = catalog_nodes[model.unique_id] + # issue warnings if the catalog columns (defined via SQL) don't match what's documented in YML + compare_model_vs_node_columns(model, corresponding_catalog_node) # Update dbt models with data types from catalog dbt_typed_models = [ @@ -95,7 +127,18 @@ def parse_typed_models(raw_manifest: dict, raw_catalog: dict, tag: Optional[str] return dbt_typed_models +class ColumnNotInCatalogError(Exception): + def __init__(self, model_id: str, column_name: str): + super().__init__( + f'Column {column_name} not found in catalog for model {model_id}, ' + 'cannot find a data type for Looker. Is the column selected in the model SQL file, ' + 'and have you run the model since adding the column to it?') + + def get_column_type_from_catalog(catalog_nodes: Dict[str, models.DbtCatalogNode], model_id: str, column_name: str): node = catalog_nodes.get(model_id) column = None if node is None else node.columns.get(column_name) - return None if column is None else column.type + if column: + return column.type + # otherwise this will fail later when we try to map the data type to a Looker type + raise ColumnNotInCatalogError(model_id, column_name)