-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* add dagster pipes example with databricks * fix notebook
- Loading branch information
Showing
5 changed files
with
197 additions
and
66 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,6 +16,9 @@ | |
from hooli_data_eng.resources.sensor_file_managers import s3FileSystem, LocalFileSystem | ||
from hooli_data_eng.resources.sensor_smtp import LocalEmailAlert, SESEmailAlert | ||
|
||
from databricks.sdk import WorkspaceClient | ||
from dagster_databricks import PipesDatabricksClient | ||
from unittest import mock | ||
|
||
# Resources represent external systems, and IO Managers specifically | ||
# tell dagster where our assets should be materialized. In dagster | ||
|
@@ -38,6 +41,14 @@ def get_env(): | |
return "PROD" | ||
return "LOCAL" | ||
|
||
client = mock.MagicMock() | ||
|
||
if get_env() == "PROD": | ||
# Databricks Client | ||
client = WorkspaceClient( | ||
host=os.environ["DATABRICKS_HOST"], | ||
token=os.environ["DATABRICKS_TOKEN"], | ||
) | ||
|
||
# The dbt file dbt_project/config/profiles.yaml | ||
# specifies which databases to target, and locally will | ||
|
@@ -68,6 +79,7 @@ def get_env(): | |
"email": LocalEmailAlert( | ||
smtp_email_to=["[email protected]"], smtp_email_from="[email protected]" | ||
), | ||
"pipes_client": ResourceDefinition.none_resource(), | ||
}, | ||
"BRANCH": { | ||
"io_manager": SnowflakePandasIOManager( | ||
|
@@ -93,6 +105,7 @@ def get_env(): | |
region_name="us-west-2", s3_bucket="hooli-demo-branch" | ||
), | ||
"email": ResourceDefinition.none_resource(), | ||
"pipes_client": ResourceDefinition.none_resource(), | ||
}, | ||
"PROD": { | ||
"io_manager": SnowflakePandasIOManager( | ||
|
@@ -122,5 +135,6 @@ def get_env(): | |
smtp_username=EnvVar("SMTP_USERNAME"), | ||
smtp_password=EnvVar("SMTP_PASSWORD"), | ||
), | ||
"pipes_client": PipesDatabricksClient(client) | ||
}, | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
#### | ||
# This represents an "external" script that is mostly independent of dagster | ||
# that dagster will orchestrate via pipes | ||
# This script must be uploaded to Databricks manually (or via some other process) | ||
# `dagster_pipes` must be available in the databricks python environment | ||
|
||
from dagster_pipes import PipesDbfsContextLoader, PipesDbfsMessageWriter, PipesContext, open_dagster_pipes | ||
import random | ||
|
||
with open_dagster_pipes(context_loader=PipesDbfsContextLoader(), message_writer=PipesDbfsMessageWriter()) as context: | ||
|
||
sample_rate = context.get_extra("sample_rate") | ||
|
||
# Stream log message back to Dagster | ||
context.log.info(f"Using sample rate: {sample_rate}") | ||
|
||
# ... your code that computes and persists the asset | ||
|
||
|
||
# Stream asset materialization metadata and data version back to Dagster. | ||
# This should be called after you've computed and stored the asset value. We | ||
# omit the asset key here because there is only one asset in scope, but for | ||
# multi-assets you can pass an `asset_key` parameter. | ||
context.report_asset_materialization( | ||
metadata={"some_spark_metric": random.choice(["scranton", "new york", "tallahassee"])}, | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
from dagster import materialize | ||
from hooli_data_eng.assets.forecasting import databricks_asset | ||
from hooli_data_eng.resources import client | ||
from dagster_databricks import PipesDatabricksClient | ||
|
||
# to meaningfully run this test you must set | ||
# DAGSTER_CLOUD_DEPLOYMENT_NAME="data-eng-prod" | ||
# and also set | ||
# DATABRICKS_HOST="your host" | ||
# DATABRICKS_TOKEN="your token" | ||
|
||
result = materialize( | ||
[databricks_asset], | ||
resources={ | ||
"pipes_client": PipesDatabricksClient( | ||
client, | ||
) | ||
}, | ||
raise_on_error=False, | ||
) |