Commit fc99bc0
Merge branch 'master' into looker_sdkerror_ancestors
mayurinehate authored Oct 10, 2024
2 parents 7a233db + a219316 commit fc99bc0
Showing 66 changed files with 2,509 additions and 105 deletions.
@@ -323,7 +323,13 @@ export const EntityProfile = <T, U>({
{showBrowseBar && <EntityProfileNavBar urn={urn} entityType={entityType} />}
{entityData?.status?.removed === true && (
<Alert
message="This entity is not discoverable via search or lineage graph. Contact your DataHub admin for more information."
message={
<>
This entity is marked as soft-deleted, likely due to stateful ingestion or a manual
deletion command, and will not appear in search or lineage graphs. Contact your DataHub
admin for more information.
</>
}
banner
/>
)}
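For context, this banner is driven by the entity's Status aspect. A minimal sketch (the URN and local GMS address are hypothetical) of how an entity ends up soft-deleted, e.g. via stateful ingestion or a deletion command:

```python
# Minimal sketch: an entity is soft-deleted when its Status aspect has
# removed=true; the profile page then renders the banner above.
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.schema_classes import StatusClass

emitter = DatahubRestEmitter("http://localhost:8080")  # assumed local GMS
emitter.emit(
    MetadataChangeProposalWrapper(
        entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,db.table,PROD)",  # hypothetical URN
        aspect=StatusClass(removed=True),
    )
)
```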
1 change: 0 additions & 1 deletion docker/airflow/docker-compose.yaml
@@ -41,7 +41,6 @@
#
# Feel free to modify this file to suit your needs.
---
version: '3'
x-airflow-common:
&airflow-common
image: ${AIRFLOW_IMAGE_NAME:-acryldata/airflow-datahub:latest}
1 change: 0 additions & 1 deletion docker/cassandra/docker-compose.cassandra.yml
@@ -1,6 +1,5 @@
# Override to use Cassandra as a backing store for datahub-gms.
---
version: '3.8'
services:
cassandra:
hostname: cassandra
1 change: 0 additions & 1 deletion docker/docker-compose-with-cassandra.yml
@@ -4,7 +4,6 @@

# NOTE: This file does not build! No dockerfiles are set. See the README.md in this directory.
---
version: '3.9'
services:
datahub-frontend-react:
hostname: datahub-frontend-react
1 change: 0 additions & 1 deletion docker/docker-compose-without-neo4j.override.yml
@@ -1,5 +1,4 @@
---
version: '3.9'
services:
datahub-gms:
env_file: datahub-gms/env/docker-without-neo4j.env
1 change: 0 additions & 1 deletion docker/docker-compose-without-neo4j.postgres.override.yml
@@ -1,6 +1,5 @@
# Override to use PostgreSQL as a backing store for datahub-gms.
---
version: '3.9'
services:
datahub-gms:
env_file:
1 change: 0 additions & 1 deletion docker/docker-compose-without-neo4j.yml
@@ -4,7 +4,6 @@

# NOTE: This file cannot build! No dockerfiles are set. See the README.md in this directory.
---
version: '3.9'
services:
datahub-frontend-react:
hostname: datahub-frontend-react
1 change: 0 additions & 1 deletion docker/docker-compose.consumers-without-neo4j.yml
@@ -1,5 +1,4 @@
# Service definitions for standalone Kafka consumer containers.
version: '3.9'
services:
datahub-gms:
environment:
1 change: 0 additions & 1 deletion docker/docker-compose.consumers.dev.yml
@@ -1,4 +1,3 @@
version: '3.9'
services:
datahub-mae-consumer:
image: acryldata/datahub-mae-consumer:debug
1 change: 0 additions & 1 deletion docker/docker-compose.consumers.yml
@@ -1,5 +1,4 @@
# Service definitions for standalone Kafka consumer containers.
version: '3.9'
services:
datahub-gms:
environment:
1 change: 0 additions & 1 deletion docker/docker-compose.dev.yml
@@ -8,7 +8,6 @@
# To make a JVM app debuggable via IntelliJ, go to its env file and add JVM debug flags, and then add the JVM debug
# port to this file.
---
version: '3.9'
services:
datahub-frontend-react:
image: acryldata/datahub-frontend-react:head
1 change: 0 additions & 1 deletion docker/docker-compose.kafka-setup.yml
@@ -1,3 +1,2 @@
# Empty docker compose for kafka-setup as we have moved kafka-setup back into the main compose
version: '3.9'
services:
1 change: 0 additions & 1 deletion docker/docker-compose.override.yml
@@ -1,6 +1,5 @@
# Default override to use MySQL as a backing store for datahub-gms (same as docker-compose.mysql.yml).
---
version: '3.9'
services:
datahub-gms:
env_file: datahub-gms/env/docker.env
1 change: 0 additions & 1 deletion docker/docker-compose.tools.yml
@@ -1,6 +1,5 @@
# Tools useful for operating & debugging DataHub.
---
version: '3.8'
services:
kafka-rest-proxy:
image: confluentinc/cp-kafka-rest:7.4.0
1 change: 0 additions & 1 deletion docker/docker-compose.yml
@@ -4,7 +4,6 @@

# NOTE: This file does not build! No dockerfiles are set. See the README.md in this directory.
---
version: '3.9'
services:
datahub-frontend-react:
hostname: datahub-frontend-react
1 change: 0 additions & 1 deletion docker/ingestion/docker-compose.yml
@@ -1,5 +1,4 @@
---
version: '3.5'
services:
ingestion:
build:
1 change: 0 additions & 1 deletion docker/mariadb/docker-compose.mariadb.yml
@@ -1,6 +1,5 @@
# Override to use MariaDB as a backing store for datahub-gms.
---
version: '3.8'
services:
mariadb:
hostname: mariadb
1 change: 0 additions & 1 deletion docker/monitoring/docker-compose.consumers.monitoring.yml
@@ -1,5 +1,4 @@
---
version: '3.8'
services:
datahub-mae-consumer:
environment:
1 change: 0 additions & 1 deletion docker/monitoring/docker-compose.monitoring.yml
@@ -1,5 +1,4 @@
---
version: '3.9'
services:
datahub-frontend-react:
environment:
1 change: 0 additions & 1 deletion docker/mysql/docker-compose.mysql.yml
@@ -1,6 +1,5 @@
# Override to use MySQL as a backing store for datahub-gms.
---
version: '3.8'
services:
mysql:
hostname: mysql
1 change: 0 additions & 1 deletion docker/quickstart/docker-compose-m1.quickstart.yml
@@ -291,7 +291,6 @@ services:
volumes:
- zkdata:/var/lib/zookeeper/data
- zklogs:/var/lib/zookeeper/log
version: '3.9'
volumes:
broker: null
esdata: null
@@ -266,7 +266,6 @@ services:
volumes:
- zkdata:/var/lib/zookeeper/data
- zklogs:/var/lib/zookeeper/log
version: '3.9'
volumes:
broker: null
esdata: null
@@ -266,7 +266,6 @@ services:
volumes:
- zkdata:/var/lib/zookeeper/data
- zklogs:/var/lib/zookeeper/log
version: '3.9'
volumes:
broker: null
esdata: null
@@ -55,4 +55,3 @@ services:
image: ${DATAHUB_MCE_CONSUMER_IMAGE:-acryldata/datahub-mce-consumer}:${DATAHUB_VERSION:-head}
ports:
- 9090:9090
version: '3.9'
1 change: 0 additions & 1 deletion docker/quickstart/docker-compose.consumers.quickstart.yml
@@ -69,4 +69,3 @@ services:
image: ${DATAHUB_MCE_CONSUMER_IMAGE:-acryldata/datahub-mce-consumer}:${DATAHUB_VERSION:-head}
ports:
- 9090:9090
version: '3.9'
@@ -1,2 +1 @@
services: {}
version: '3.9'
1 change: 0 additions & 1 deletion docker/quickstart/docker-compose.monitoring.quickstart.yml
@@ -41,6 +41,5 @@ services:
- 9089:9090
volumes:
- ../monitoring/prometheus.yaml:/etc/prometheus/prometheus.yml
version: '3.9'
volumes:
grafana-storage: null
1 change: 0 additions & 1 deletion docker/quickstart/docker-compose.quickstart.yml
@@ -291,7 +291,6 @@ services:
volumes:
- zkdata:/var/lib/zookeeper/data
- zklogs:/var/lib/zookeeper/log
version: '3.9'
volumes:
broker: null
esdata: null
5 changes: 0 additions & 5 deletions docker/quickstart/generate_docker_quickstart.py
@@ -120,11 +120,6 @@ def modify_docker_config(base_path, docker_yaml_config):
elif volumes[i].startswith("./"):
volumes[i] = "." + volumes[i]

# 10. Set docker compose version to 3.
# We need at least this version, since we use features like start_period for
# healthchecks (with services dependencies based on them) and shell-like variable interpolation.
docker_yaml_config["version"] = "3.9"


def dedup_env_vars(merged_docker_config):
for service in merged_docker_config["services"]:
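The block deleted above (step 10 of modify_docker_config), like the `version:` removals across the compose files in this commit, reflects that the Compose specification has made the top-level `version` key obsolete; modern `docker compose` ignores it and emits a warning. A minimal sketch, assuming PyYAML, of stripping the key from an existing file:

```python
# Minimal sketch, assuming PyYAML: drop the obsolete top-level `version`
# key from a compose file; modern `docker compose` ignores it anyway.
import yaml

def strip_compose_version(path: str) -> None:
    with open(path) as f:
        config = yaml.safe_load(f)
    config.pop("version", None)  # remove the deprecated key if present
    with open(path, "w") as f:
        yaml.safe_dump(config, f, sort_keys=False)
```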
1 change: 1 addition & 0 deletions docs/lineage/airflow.md
@@ -141,6 +141,7 @@ conn_id = datahub_rest_default # or datahub_kafka_default
| capture_tags_info | true | If true, the tags field of the DAG will be captured as DataHub tags. |
| capture_executions | true | If true, we'll capture task runs in DataHub in addition to DAG definitions. |
| materialize_iolets | true | Create or un-soft-delete all entities referenced in lineage. |
| render_templates | true | If true, jinja-templated fields will be automatically rendered to improve the accuracy of SQL statement extraction. |
| datajob_url_link | taskinstance | If set to taskinstance, the DataJob URL links to the task instance page in Airflow. It can also be set to grid. |
| graceful_exceptions | true | If set to true, most runtime errors in the lineage backend will be suppressed and will not cause the overall task to fail. Note that configuration issues will still throw exceptions. |
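A hedged sketch of how the new `render_templates` flag is read; like the other options in this table, it lives under the `[datahub]` section of airflow.cfg:

```python
# Sketch (assuming Airflow's standard config API): the plugin reads the flag
# from the [datahub] section; setting render_templates = false in airflow.cfg
# skips the ti.render_templates() call in the listener.
from airflow.configuration import conf

render_templates = conf.getboolean("datahub", "render_templates", fallback=True)
```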
@@ -43,19 +43,24 @@ class DatahubLineageConfig(ConfigModel):

capture_executions: bool = False

datajob_url_link: DatajobUrl = DatajobUrl.TASKINSTANCE

# Note that this field is only respected by the lineage backend.
# The Airflow plugin v2 behaves as if it were set to True.
graceful_exceptions: bool = True

# The remaining config fields are only relevant for the v2 plugin.
enable_extractors: bool = True

# If true, ti.render_templates() will be called in the listener.
# Makes extraction of jinja-templated fields more accurate.
render_templates: bool = True

log_level: Optional[str] = None
debug_emitter: bool = False

disable_openlineage_plugin: bool = True

# Note that this field is only respected by the lineage backend.
# The Airflow plugin behaves as if it were set to True.
graceful_exceptions: bool = True

datajob_url_link: DatajobUrl = DatajobUrl.TASKINSTANCE

def make_emitter_hook(self) -> "DatahubGenericHook":
# This is necessary to avoid issues with circular imports.
from datahub_airflow_plugin.hooks.datahub import DatahubGenericHook
@@ -84,6 +89,7 @@ def get_lineage_config() -> DatahubLineageConfig:
disable_openlineage_plugin = conf.get(
"datahub", "disable_openlineage_plugin", fallback=True
)
render_templates = conf.get("datahub", "render_templates", fallback=True)
datajob_url_link = conf.get(
"datahub", "datajob_url_link", fallback=DatajobUrl.TASKINSTANCE.value
)
@@ -102,4 +108,5 @@ def get_lineage_config() -> DatahubLineageConfig:
debug_emitter=debug_emitter,
disable_openlineage_plugin=disable_openlineage_plugin,
datajob_url_link=datajob_url_link,
render_templates=render_templates,
)
@@ -386,7 +386,8 @@ def on_task_instance_running(
f"DataHub listener got notification about task instance start for {task_instance.task_id}"
)

task_instance = _render_templates(task_instance)
if self.config.render_templates:
task_instance = _render_templates(task_instance)

# The type ignore is to placate mypy on Airflow 2.1.x.
dagrun: "DagRun" = task_instance.dag_run # type: ignore[attr-defined]
@@ -478,7 +479,8 @@ def on_task_instance_finish(
) -> None:
dagrun: "DagRun" = task_instance.dag_run # type: ignore[attr-defined]

task_instance = _render_templates(task_instance)
if self.config.render_templates:
task_instance = _render_templates(task_instance)

# We must prefer the task attribute, in case modifications to the task's inlets/outlets
# were made by the execute() method.
1 change: 1 addition & 0 deletions metadata-ingestion/setup.py
@@ -722,6 +722,7 @@
"snowflake-summary = datahub.ingestion.source.snowflake.snowflake_summary:SnowflakeSummarySource",
"snowflake-queries = datahub.ingestion.source.snowflake.snowflake_queries:SnowflakeQueriesSource",
"superset = datahub.ingestion.source.superset:SupersetSource",
"preset = datahub.ingestion.source.preset:PresetSource",
"tableau = datahub.ingestion.source.tableau.tableau:TableauSource",
"openapi = datahub.ingestion.source.openapi:OpenApiSource",
"metabase = datahub.ingestion.source.metabase:MetabaseSource",
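With the `preset` plugin registered above, a recipe can target it like any other source. A hypothetical sketch via the Python pipeline API; the source config keys shown are assumptions, not confirmed by this diff:

```python
# Hypothetical sketch: run the newly registered `preset` source.
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "preset",
            "config": {
                "connect_uri": "https://example.preset.io",  # assumed key
                "api_key": "...",  # assumed key
                "api_secret": "...",  # assumed key
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},
        },
    }
)
pipeline.run()
pipeline.raise_from_status()
```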
7 changes: 7 additions & 0 deletions metadata-ingestion/src/datahub/cli/config_utils.py
@@ -84,6 +84,13 @@ def _get_config_from_env() -> Tuple[Optional[str], Optional[str]]:
return url or host, token


def require_config_from_env() -> Tuple[str, Optional[str]]:
host, token = _get_config_from_env()
if host is None:
raise MissingConfigError("No GMS host was provided in env variables.")
return host, token


def load_client_config() -> DatahubClientConfig:
gms_host_env, gms_token_env = _get_config_from_env()
if gms_host_env:
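A short usage sketch of the new helper; the environment variable name is assumed from DataHub's CLI conventions:

```python
# Sketch: require_config_from_env fails fast with MissingConfigError instead
# of silently returning None when no GMS host is configured.
import os

from datahub.cli import config_utils

os.environ["DATAHUB_GMS_URL"] = "http://localhost:8080"  # assumed env var name
host, token = config_utils.require_config_from_env()
assert host == "http://localhost:8080"
```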
16 changes: 12 additions & 4 deletions metadata-ingestion/src/datahub/cli/delete_cli.py
@@ -338,10 +338,18 @@ def by_filter(
# TODO: add some validation on entity_type

if not force and not soft and not dry_run:
click.confirm(
"This will permanently delete data from DataHub. Do you want to continue?",
abort=True,
)
if only_soft_deleted:
click.confirm(
"This will permanently delete data from DataHub. Do you want to continue?",
abort=True,
)
else:
click.confirm(
"Hard deletion will permanently delete data from DataHub and can be slow. "
"We generally recommend using soft deletes instead. "
"Do you want to continue?",
abort=True,
)

graph = get_default_graph()
logger.info(f"Using {graph}")
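The soft-versus-hard distinction drawn by the new prompts is also available programmatically; a minimal sketch, assuming the graph client's delete helpers:

```python
# Minimal sketch (assumes DataHubGraph delete helpers): soft deletes flip the
# Status aspect and are reversible; hard deletes permanently remove metadata
# and can be slow, hence the sterner confirmation above.
from datahub.ingestion.graph.client import get_default_graph

graph = get_default_graph()
urn = "urn:li:dataset:(urn:li:dataPlatform:hive,db.table,PROD)"  # hypothetical URN
graph.soft_delete_entity(urn)  # hidden from search/lineage, recoverable
# graph.hard_delete_entity(urn)  # permanent; soft deletes are generally preferred
```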
6 changes: 6 additions & 0 deletions metadata-ingestion/src/datahub/emitter/rest_emitter.py
@@ -76,6 +76,12 @@ def __init__(
):
if not gms_server:
raise ConfigurationError("gms server is required")
if gms_server == "__from_env__" and token is None:
# HACK: similar to what we do with system auth, we transparently
# inject the config in here. Ideally this should be done in the
# config loader or by the caller, but it gets the job done for now.
gms_server, token = config_utils.require_config_from_env()

self._gms_server = fixup_gms_url(gms_server)
self._token = token
self.server_config: Dict[str, Any] = {}
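A hedged sketch of this escape hatch in action; the environment variable name is assumed from DataHub's CLI conventions:

```python
# Sketch: with the "__from_env__" sentinel, the emitter transparently resolves
# its GMS host and token from the environment instead of explicit arguments.
import os

from datahub.emitter.rest_emitter import DatahubRestEmitter

os.environ["DATAHUB_GMS_URL"] = "http://localhost:8080"  # assumed env var name
emitter = DatahubRestEmitter(gms_server="__from_env__")
emitter.test_connection()
```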
17 changes: 9 additions & 8 deletions metadata-ingestion/src/datahub/ingestion/graph/client.py
@@ -214,27 +214,28 @@ def _get_generic(self, url: str, params: Optional[Dict] = None) -> Dict:
def _post_generic(self, url: str, payload_dict: Dict) -> Dict:
return self._send_restli_request("POST", url, json=payload_dict)

def _make_rest_sink_config(self) -> "DatahubRestSinkConfig":
from datahub.ingestion.sink.datahub_rest import (
DatahubRestSinkConfig,
RestSinkMode,
)
def _make_rest_sink_config(
self, extra_config: Optional[Dict] = None
) -> "DatahubRestSinkConfig":
from datahub.ingestion.sink.datahub_rest import DatahubRestSinkConfig

# This is a bit convoluted - this DataHubGraph class is a subclass of DatahubRestEmitter,
# but initializing the rest sink creates another rest emitter.
# TODO: We should refactor out the multithreading functionality of the sink
# into a separate class that can be used by both the sink and the graph client
# e.g. a DatahubBulkRestEmitter that both the sink and the graph client use.
return DatahubRestSinkConfig(**self.config.dict(), mode=RestSinkMode.ASYNC)
return DatahubRestSinkConfig(**self.config.dict(), **(extra_config or {}))

@contextlib.contextmanager
def make_rest_sink(
self, run_id: str = _GRAPH_DUMMY_RUN_ID
self,
run_id: str = _GRAPH_DUMMY_RUN_ID,
extra_sink_config: Optional[Dict] = None,
) -> Iterator["DatahubRestSink"]:
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.sink.datahub_rest import DatahubRestSink

sink_config = self._make_rest_sink_config()
sink_config = self._make_rest_sink_config(extra_config=extra_sink_config)
with DatahubRestSink(PipelineContext(run_id=run_id), sink_config) as sink:
yield sink
if sink.report.failures:
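A usage sketch of the updated context manager; the extra config key shown is an assumption about DatahubRestSinkConfig:

```python
# Sketch: emit MCPs through the graph's REST sink, threading a config override
# through the new extra_sink_config parameter.
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.api.common import RecordEnvelope
from datahub.ingestion.api.sink import NoopWriteCallback
from datahub.ingestion.graph.client import get_default_graph
from datahub.metadata.schema_classes import StatusClass

graph = get_default_graph()
mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,db.table,PROD)",  # hypothetical
    aspect=StatusClass(removed=False),
)
with graph.make_rest_sink(extra_sink_config={"mode": "ASYNC"}) as sink:  # assumed key/value
    sink.write_record_async(RecordEnvelope(mcp, metadata={}), NoopWriteCallback())
```

Afterwards, the sink's report (see the counters added below) can be inspected, e.g. `sink.report.async_batches_prepared`.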
2 changes: 2 additions & 0 deletions metadata-ingestion/src/datahub/ingestion/sink/datahub_rest.py
@@ -79,6 +79,7 @@ class DataHubRestSinkReport(SinkReport):
gms_version: Optional[str] = None
pending_requests: int = 0

async_batches_prepared: int = 0
async_batches_split: int = 0

main_thread_blocking_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
@@ -260,6 +261,7 @@ def _emit_batch_wrapper(
events.append(event)

chunks = self.emitter.emit_mcps(events)
self.report.async_batches_prepared += 1
if chunks > 1:
self.report.async_batches_split += chunks
logger.info(
(The remaining changed files are not shown.)